
Created by AAAS24 | Code in GitHub

Related Content:
Ted Talks Project Overview
Blog 1 - Web Scrapping
Blog 2 - Preprocessing
Blog 3 - PCA and Clustering
Blog 4 - Predicting Performing Models
Blog 5 - Recommendation Engine
Tableau Visualization


In [13]:
import os as os
            
            #Data Cleaning & Visualization
            import pandas as pd
            import seaborn as sns
            import numpy as np
            
            import matplotlib.pyplot as plt
            #ML Regression, Decision Trees
            from sklearn.preprocessing import StandardScaler, LabelEncoder
            from sklearn.model_selection import train_test_split
            from sklearn.linear_model import LogisticRegression
            from sklearn.metrics import roc_curve
            from sklearn.metrics import roc_auc_score
            from sklearn.metrics import confusion_matrix
            from sklearn.model_selection import train_test_split
            
            #ML PCA & Clustering
            from scipy.cluster import hierarchy
            import seaborn as sns
            from sklearn import decomposition, preprocessing, cluster, tree
            import pydotplus
            from yellowbrick.cluster.silhouette import SilhouetteVisualizer
            
            
            #ML Recommendation Algorithm 
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.metrics.pairwise import cosine_similarity
            

TED Talks¶

TED Talks is a fascinating source of content. As a TED Talks fan, I wanted to better understand the content available.

As a user of the TED Talks podcast, I had some general questions:

  • Which are the most viewed videos? (To add to my list!)
  • Is there a substantial difference between the most liked and the most viewed videos? If so, which are the top videos in each?
  • Are there any authors that have presented more than once?
  • Which content categories are most viewed?
  • Which content categories are made available the most?
  • Could the duration of a video affect its likability? (My hunch is yes!)
  • When was the video published? (Assuming that older videos can have more likes than newer ones)
  • What type of event was the talk part of? Was it a Women's event, or one tied to a specific location?

The dataset used to answer these questions combines the TED Talks dataset available on Kaggle with additional information I scraped directly from the TED Talks website. To review the overall project, which includes the scraping code, you can read this post.

I have divided the content into several steps:

STEP 1 - PREPROCESSING THE DATA
--------------------------------------------

  • Transformed the dataset obtained into organized pandas DataFrames using the pandas library

STEP 2 - DATA EXPLORATION
--------------------------------------------

  • Applied Principal Component Analysis (PCA) to understand the key columns of the main dataset
  • Applied clustering based on the most important principal components identified

STEP 3 - ANALYSIS
--------------------------------------------

  • Answered the main questions posed

STEP 4 - MACHINE LEARNING
--------------------------------------------

  • Evaluated the accuracy of ML models for predicting videos with good performance, including:

    • Logistic Regression
    • Simple Decision Tree
    • Random Forest
    • XGBoost
    • Model Comparison

  • Built a simple ML recommendation engine for TED Talks (a minimal sketch of that approach follows this list)
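The recommendation engine itself is covered in its own post (see the related content above), but the core idea can be sketched with the TfidfVectorizer and cosine_similarity imports listed in the cell above. The snippet below is a minimal, hypothetical illustration on made-up descriptions, not the project's actual implementation:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import pandas as pd

    # Hypothetical mini-corpus of talk descriptions (a stand-in for df['description_1'])
    talks = pd.Series({
        'Talk A': 'climate change and renewable energy policy',
        'Talk B': 'the psychology of motivation and personal growth',
        'Talk C': 'how wind energy can slow climate change',
    })

    # Turn each description into a TF-IDF vector
    tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(talks.values)

    # Pairwise cosine similarity between all talks
    sim = cosine_similarity(tfidf_matrix)

    # Recommend the talk most similar to 'Talk A' (excluding itself)
    scores = pd.Series(sim[0], index=talks.index).drop('Talk A')
    print(scores.sort_values(ascending=False).head(1))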

STEP 1 - PREPROCESSING THE DATA ¶

We will proceed to clean the raw data and generate two base DataFrames to be used in the following steps.

In [22]:
def preprocessing(df):  
                '''
                This function does the main cleaning to create a base DataFrame called 'df'
                '''
                #drop duplicated columns
                drop_column=['description2', 'Unnamed: 0', 'link', 'author_y', 'title','url', 'title_1']
                df=df.drop(drop_column,axis=1)
                
                #renaming
                column_names=['author', 'title', 'description_1', 'duration_seg',
                   'date_released', 'keywords', 'description_2', 'date_recorded', 'views',
                   'likes']
            
                df.columns=column_names
            
                #
                #MODIFYING COLUMN: date_recorded
                #
                df['date_recorded']= pd.to_datetime(df['date_recorded'], format='%B %Y')
              
                ##separate data into new Column
                list_months=[]
                list_years=[]
                for i in range(df.shape[0]):
                    list_months.append(df['date_recorded'][i].month)
                    list_years.append(df['date_recorded'][i].year)
                df['date_recorded_year']=list_years
                df['date_recorded_month']=list_months
            
                #
                #MODIFYING COLUMN: date_released
                #
                column='date_released'
                df[column]= pd.to_datetime(df[column], format='%Y-%m-%d %H:%M:%S')
            
                ##separate data into new Column
                list_months=[]
                list_years=[]
                list_hours=[]
                list_minutes=[]
                for i in range(df.shape[0]):
                    list_months.append(df[column][i].month)
                    list_years.append(df[column][i].year)
                    list_hours.append(df[column][i].hour)
                    list_minutes.append(df[column][i].minute)
                df[column+'_year']=list_years
                df[column+'_month']=list_months
                df[column+'_hour']=list_hours
                df[column+'_minute']=list_minutes
            
                #
                #MODIFYING COLUMN: 'keywords'
                #
                df_key=df.keywords
                i=0
                df_result=pd.DataFrame()
                ##transforming line into string
                for line in df_key:
                    line=(str(line).replace("[","").replace("]","").split(','))
                    new_line=[]
                    ##removing additional spaces in words and converting them into lower case
                    for word in line:
                        word=word.lower().replace(' ', '')[1:-1]
                        new_line.append(word)
                    ##transforming line into string
                    new_line=str(new_line).replace("[","").replace("]","")
                    ##writing line into dataframe
                    df_result.at[i,'keywords2']=new_line
                    i=i+1
              
                df=pd.concat([df,df_result], axis=1)
            
                #drop initial columns
                drop_columns=['date_recorded','date_released', 'keywords']
                df=df.drop(drop_columns, axis=1)
                
                return(df)
            
In [6]:
def create_dummies_file(df):
                '''
                This function does: 
                1) converts df.keywords2 into dummy columns
                2) adds the dummy columns to 'df'
                3) creates a file called 'keywords.csv' in order to manually map new categories from the keywords
                '''
                #converting keywords into dummy columns
                df2=df.keywords2.str.get_dummies(',')
            
                #joining with df
                df=pd.concat([df,df2], axis=1)
                
                #removing 'ted' column
                column_to_drop=df.columns[362]
                df2=df.drop(column_to_drop, axis=1)
            
                #counting dummies and creating file to rename categories
                dummy_columns=pd.Series(np.arange(15,349,1))[1:]
                df_dummies=df.iloc[:,dummy_columns].sum().reset_index()
                df_dummies.columns=['keyword', 'sum']
                df2=df_dummies.copy()
                (df2
                 .groupby(['keyword'])
                 .agg({'sum':'sum'})
                )
                df2=df2.sort_values(by='sum', ascending=False)
                cwd=os.getcwd()
                df2.to_csv(cwd+'/keywords.csv')
                return df
            
In [7]:
def dummy_data(df):
                '''
                This function takes the keywords_categories.csv file and creates a new dataframe 'df_dummies' to analyze the keywords
                '''
                # cwd=os.getcwd()
                # categories=pd.read_csv(cwd+'/keywords_categories.csv')
                #for github
                categories=pd.read_csv('https://github.com/aaas24/code_library/raw/main/ted_talks/2_preprocessing/keywords_categories.csv')
            
                #transforming categories
                new_cat=(categories.columns.values.tolist())
                dic={key: None for key in new_cat}
            
                ##creating dictionary with categories file
                for column in range (0,categories.shape[1]):
                    dic_values=[]
                    key=new_cat[column]
                    for row in range (0,categories.shape[0]):
                        value=categories.iloc[row,column]
                        if value is np.nan:
                            pass
                        else:
                            value=value.replace(' ', '')[1:-1]
                            dic_values.append(value)
                    dic.update({key:dic_values})
            
                ##combining the keyword dummy columns with likes and views
                dummy_columns=pd.concat([df.iloc[:,16:349], df[['likes', 'views']]], axis=1)
                df_dummies=dummy_columns.iloc[:,:-2].sum().reset_index()
                df_dummies.columns=['sub_category', 'num_talks']
                
                #adding categories to subcategories
                list_categories=[]
                for i in range (0, len(set(df_dummies['sub_category']))):
                    keyword=df_dummies['sub_category'][i][2:-1]
                    ###find category of keyword in dictionary
                    for key, value_list in dic.items():
                        for x in value_list:
                            if keyword==x:
                                category=key
                    ###add category to list
                    list_categories.append(category)
                
                ##add list_categories to df
                df_dummies['category']=list_categories
            
                #add num likes and views
                list_likes=[]
                list_views=[]
                for row in range (0,df_dummies.shape[0]):
                    subcategory=df_dummies.iloc[row,0]
                    df2=dummy_columns[[subcategory,'likes','views']]
                    df2.columns=['A', 'likes','views']
                    df3=(df2
                         .query('A>0')
                         .groupby('A')
                         .agg({'likes': ['sum'], 'views':['sum']})
                        )
                    
                    list_likes.append(df3.iloc[0,0])
                    list_views.append(df3.iloc[0,1])
                #add lists to df_dummies
                df_dummies['likes']=list_likes
                df_dummies['views']=list_views
                return df_dummies
            
In [19]:
def main():
                #load data from datasets drive
                datasets='/Volumes/Datasets'
                raw_data=pd.read_csv(os.path.join(datasets,'ted_talks/1_raw_data/final_raw_data.csv'))
                df=raw_data.copy()
                df=preprocessing(df)
                df=create_dummies_file(df)
                df_dummies=dummy_data(df)
                return(df, df_dummies)
            
In [23]:
df, df_dummies=main()
            df.head(5)
            
Out[23]:
author title description_1 duration_seg description_2 views likes date_recorded_year date_recorded_month date_released_year ... 'water' 'weather' 'windenergy' 'women' 'womeninbusiness' 'work' 'work-lifebalance' 'writing' 'youth' 'ted'
0 Ozawa Bineshi Albert Climate action needs new frontline leadership "We can't rely on those who created climate ch... 834 "We can't rely on those who created climate ch... 404000 12000 2021 12 2022 ... 0 0 0 0 0 0 0 0 0 1
1 Sydney Iaukea The dark history of the overthrow of Hawaii "On January 16th, 1895, two men arrived at Lil... 0 "On January 16th, 1895, two men arrived at Lil... 214000 6400 2022 2 2022 ... 0 0 0 1 0 0 0 0 0 1
2 Martin Reeves Why play is essential for business "To thrive in today's competitive economy, you... 665 "To thrive in today's competitive economy, you... 412000 12000 2021 9 2022 ... 0 0 0 0 0 1 0 0 0 1
3 James K. Thornton Why is China appointing judges to combat clima... "Why is China appointing thousands of judges t... 695 "Why is China appointing thousands of judges t... 427000 12000 2021 10 2022 ... 0 0 0 0 0 0 0 0 0 1
4 Mahendra Singhi Cement's carbon problem -- and 2 ways to fix it "Cement is vital to modernizing all kinds of i... 671 "Cement is vital to modernizing all kinds of i... 2400 72 2021 10 2022 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 363 columns

In [25]:
df_dummies.head(5)
            
Out[25]:
sub_category num_talks category likes views
0 '3dprinting' 9 technology 201574 6655100
1 'activism' 352 values & emotions 21752759 714057797
2 'addiction' 20 health 1870500 60982000
3 'africa' 197 global 9097799 299541000
4 'aging' 93 society 8152092 269034199
In [31]:
#writing data to disk
            df.to_csv(os.getcwd()+'/5_final_data/df.csv')
            df_dummies.to_csv(os.getcwd()+'/5_final_data/df_dummies.csv')
            

STEP 2 - DATA EXPLORATION ¶

With the two DataFrames created in the previous step, we proceed to apply data science techniques to understand the dataset and explore its key variables.

Principal Component Analysis (PCA)¶

Even though the dataset is intuitively simple to understand, I wanted to practice doing a PCA analysis, which is normally reserved for models with many variables. To do this I followed these steps:

  • Prepared the DataFrame by dropping all categorical and dummy columns from df, and standardized the data using 'StandardScaler' from sklearn.preprocessing
  • Used decomposition.PCA from the sklearn library to create a dataframe with component columns
  • Applied Principal Component Analysis (PCA) to understand the key columns of the main dataset
  • Applied clustering based on the most important principal components identified
  • Explored key variables: dates, duration, keywords
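Before walking through the cells below, here is a compact, self-contained sketch of the variance check on synthetic data; in the notebook, the standardized df_model built in the next cells plays the role of the random frame used here:

    import numpy as np
    import pandas as pd
    from sklearn import decomposition, preprocessing

    # Synthetic stand-in for the 9 numeric TED Talks features (duration, views, likes, dates, ...)
    np.random.seed(42)
    X = pd.DataFrame(np.random.randn(100, 9), columns=[f'feature_{i}' for i in range(9)])

    # Standardize, then fit PCA on all components
    X_std = preprocessing.StandardScaler().fit_transform(X)
    pca = decomposition.PCA().fit(X_std)

    # Cumulative share of variance explained by the first k components
    cum_var = np.cumsum(pca.explained_variance_ratio_)
    print(pd.Series(cum_var, index=[f'PC{i+1}' for i in range(len(cum_var))]))

The number of components to keep can then be read off as the smallest k whose cumulative share crosses a chosen threshold (for example 80%).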
In [32]:
df.head(3)
            
Out[32]:
author title description_1 duration_seg description_2 views likes date_recorded_year date_recorded_month date_released_year ... 'water' 'weather' 'windenergy' 'women' 'womeninbusiness' 'work' 'work-lifebalance' 'writing' 'youth' 'ted'
0 Ozawa Bineshi Albert Climate action needs new frontline leadership "We can't rely on those who created climate ch... 834 "We can't rely on those who created climate ch... 404000 12000 2021 12 2022 ... 0 0 0 0 0 0 0 0 0 1
1 Sydney Iaukea The dark history of the overthrow of Hawaii "On January 16th, 1895, two men arrived at Lil... 0 "On January 16th, 1895, two men arrived at Lil... 214000 6400 2022 2 2022 ... 0 0 0 1 0 0 0 0 0 1
2 Martin Reeves Why play is essential for business "To thrive in today's competitive economy, you... 665 "To thrive in today's competitive economy, you... 412000 12000 2021 9 2022 ... 0 0 0 0 0 1 0 0 0 1

3 rows × 363 columns

In [33]:
#dropping categorical columns
            df_model=df.drop(['author','title','description_1', 'description_2', 'keywords2'], axis=1).iloc[:,:9]
            df_model.head(3)
            
Out[33]:
duration_seg views likes date_recorded_year date_recorded_month date_released_year date_released_month date_released_hour date_released_minute
0 834 404000 12000 2021 12 2022 2 9 41
1 0 214000 6400 2022 2 2022 2 10 13
2 665 412000 12000 2021 9 2022 2 9 51
In [34]:
import pandas as pd
            import seaborn as sns
            import numpy as np
            import matplotlib.pyplot as plt
            import os as os
            
            #ML Regression, Decision Trees
            from sklearn.preprocessing import StandardScaler, LabelEncoder
            from sklearn.model_selection import train_test_split
            from sklearn.linear_model import LogisticRegression
            from sklearn.metrics import roc_curve
            from sklearn.metrics import roc_auc_score
            from sklearn.metrics import confusion_matrix
            from sklearn.model_selection import train_test_split
            
            #ML PCA
            from scipy.cluster import hierarchy
            import seaborn as sns
            from sklearn import decomposition, preprocessing, cluster, tree
            import pydotplus
            from yellowbrick.cluster.silhouette import SilhouetteVisualizer
            
            
            X = df_model
            std = preprocessing.StandardScaler()
            X_std = pd.DataFrame(std.fit_transform(X), columns=X.columns)
            X_std
            
Out[34]:
duration_seg views likes date_recorded_year date_recorded_month date_released_year date_released_month date_released_hour date_released_minute
0 0.264675 -0.464984 -0.470418 1.309927 1.591529 1.446482 -1.209561 -0.513606 0.637580
1 -1.536697 -0.518216 -0.522405 1.528134 -1.311085 1.446482 -1.209561 -0.256235 -0.721544
2 -0.100352 -0.462743 -0.470418 1.309927 0.720745 1.446482 -1.209561 -0.513606 1.122981
3 -0.035554 -0.458540 -0.470418 1.309927 1.011006 1.446482 -1.209561 -0.513606 0.831740
4 -0.087392 -0.577500 -0.581151 1.309927 1.011006 1.446482 -1.209561 -0.513606 0.491959
... ... ... ... ... ... ... ... ... ...
5435 0.605942 3.624367 3.670033 -1.963185 -1.311085 -2.359267 0.221125 2.060102 -0.818624
5436 0.977448 19.594017 18.913574 -1.963185 -1.311085 -2.359267 -0.065012 2.060102 -0.818624
5437 0.873772 0.234319 0.235130 -1.963185 -1.311085 -2.359267 -0.065012 2.060102 -0.818624
5438 0.873772 -0.017834 -0.024809 -1.963185 -1.311085 -2.359267 -0.065012 2.060102 -0.818624
5439 0.573543 0.430437 0.430084 -1.963185 -1.311085 -2.359267 -0.065012 2.060102 -0.818624

5440 rows × 9 columns

In [35]:
pca = decomposition.PCA()
            pca_X = pd.DataFrame(pca.fit_transform(X_std), columns=[f'PC{i+1}' for i in range(len(X.columns))])
            pca_X
            
Out[35]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
0 -2.353050 0.640153 0.284316 -0.603109 0.929410 -1.291875 0.239075 -0.030687 0.003502
1 -1.731059 0.280292 -1.858844 -1.413271 -1.016366 0.526591 1.118925 -0.092696 0.003599
2 -2.317451 0.717196 0.179404 -1.127605 0.461949 -0.588923 -0.108814 -0.019253 0.005456
3 -2.306187 0.672507 0.132764 -0.911835 0.568249 -0.854332 0.092875 -0.021647 0.008290
4 -2.314708 0.465819 -0.036364 -0.827110 0.516955 -0.914731 0.337036 -0.014700 0.002438
... ... ... ... ... ... ... ... ... ...
5435 5.811723 2.436950 -1.161030 0.723981 0.493394 0.399537 -0.999472 -0.290728 -0.032605
5436 17.210024 21.196511 -2.006792 2.509708 1.380970 -0.442098 -0.713892 -0.125093 0.480695
5437 3.392499 -1.689732 -0.931235 0.032325 0.647645 0.457366 -0.976476 -0.341491 -0.000921
5438 3.206750 -1.997994 -0.916213 -0.001125 0.638960 0.469690 -0.979716 -0.344335 0.004586
5439 3.476469 -1.421349 -1.059695 0.079071 0.399186 0.400620 -1.037660 -0.353432 -0.000107

5440 rows × 9 columns

In [36]:
#variance or relevance of the PCs. In this case the first 3 hold ~60% of the variance in the data
            pca.explained_variance_ratio_
            
Out[36]:
array([2.61155893e-01, 2.09808747e-01, 1.27262369e-01, 1.19521228e-01,
                   1.01198399e-01, 9.06161518e-02, 8.05875806e-02, 9.81195975e-03,
                   3.76713129e-05])
In [37]:
# Components
            # First component is .36 * Views + .36 * likes + 0.19 * Dur ... etc
            pca.components_[0]
            
Out[37]:
array([ 0.19279606,  0.36279544,  0.36266011, -0.52126245, -0.25986923,
                   -0.53505052, -0.01655183,  0.10777476, -0.2495432 ])
In [38]:
# What columns make up the components 1 & 2?
            # 1 - Views & Likes
            # 2 - Recorded & Released Year
            (pd.DataFrame(pca.components_, columns=X.columns)
             .iloc[:2]
             .plot.bar()
             .legend(bbox_to_anchor=(1,1)))
            
Out[38]:
<matplotlib.legend.Legend at 0x7fcd28f9d3c8>
In [39]:
# What columns make up the components 3 & 4?
            (pd.DataFrame(pca.components_, columns=X.columns)
             .iloc[2:4]
             .plot.bar()
             .legend(bbox_to_anchor=(1,1)))
            
Out[39]:
<matplotlib.legend.Legend at 0x7fcd28e5d5c0>
In [40]:
# Plot with Seaborn
            x='PC1'
            y='PC2'
            val='date_released_month'
            sns.scatterplot(x=x, y=y, 
                            data=pca_X.assign(val=X[val]), 
                            hue='val')
            
Out[40]:
<AxesSubplot:xlabel='PC1', ylabel='PC2'>

Clustering¶

We selected 4 clusters. We can describe the clusters as:

  • 0 - Newer videos released in fall
  • 1 - Newer videos released earlier in the year
  • 2 - Older videos, with longer duration (in seconds)
  • 3 - Highest views & likes
In [41]:
inerts = []
            for i in range(2, 20):
                k = cluster.KMeans(n_clusters=i, random_state=42)
                k.fit(X_std)
                inerts.append(k.inertia_)
                
            pd.Series(inerts).plot()
            
Out[41]:
<AxesSubplot:>
In [42]:
start, end = 2, 10
            cols = 2
            rows = ((end - start) // cols)
            fig, axes = plt.subplots(rows, cols, figsize=(12,8))
            axes = axes.reshape(cols * rows)
            for i, k in enumerate(range(start, end), 0):
                ax = axes[i]
                sil = SilhouetteVisualizer(cluster.KMeans(n_clusters=k, random_state=42), ax=ax)
                sil.fit(X_std)
                sil.finalize()
            plt.tight_layout()
            
In [43]:
# Try another mechanism
            fig, ax = plt.subplots(figsize=(10,8))
            hierarchy.dendrogram(hierarchy.linkage(X_std, method='ward'),
                                truncate_mode='lastp', p=20, show_contracted=True)
            pass  # here to hide return value of above
            
In [44]:
# going to choose 4 clusters
            k9 = cluster.KMeans(n_clusters=4, random_state=42)
            k9.fit(X_std)
            labels = k9.predict(X_std)
            
In [45]:
labels
            
Out[45]:
array([1, 1, 1, ..., 2, 2, 2], dtype=int32)
In [46]:
X.assign(label=labels)
            
Out[46]:
duration_seg views likes date_recorded_year date_recorded_month date_released_year date_released_month date_released_hour date_released_minute label
0 834 404000 12000 2021 12 2022 2 9 41 1
1 0 214000 6400 2022 2 2022 2 10 13 1
2 665 412000 12000 2021 9 2022 2 9 51 1
3 695 427000 12000 2021 10 2022 2 9 45 1
4 671 2400 72 2021 10 2022 2 9 38 1
... ... ... ... ... ... ... ... ... ... ...
5435 992 15000000 458000 2006 2 2006 7 19 11 3
5436 1164 72000000 2100000 2006 2 2006 6 19 11 3
5437 1116 2900000 88000 2006 2 2006 6 19 11 2
5438 1116 2000000 60000 2006 2 2006 6 19 11 2
5439 977 3600000 109000 2006 2 2006 6 19 11 2

5440 rows × 10 columns

In [47]:
(X.assign(label=labels)
              .groupby('label')
              .agg(['mean', 'var'])
              .T
            )
            
Out[47]:
label 0 1 2 3
duration_seg mean 6.350369e+02 6.455351e+02 8.631271e+02 7.886462e+02
var 1.554876e+05 2.034898e+05 2.523900e+05 1.738138e+05
views mean 1.579249e+06 1.829965e+06 1.887791e+06 2.616923e+07
var 2.991364e+12 4.134593e+12 3.745353e+12 1.764240e+14
likes mean 4.808912e+04 5.563381e+04 5.721366e+04 7.927385e+05
var 2.814619e+09 3.858025e+09 3.489601e+09 1.495301e+11
date_recorded_year mean 2.017660e+03 2.017270e+03 2.009742e+03 2.012277e+03
var 5.702002e+00 6.409831e+00 1.294749e+01 1.195337e+01
date_recorded_month mean 8.097666e+00 6.384876e+00 5.200713e+00 5.215385e+00
var 7.532558e+00 1.283434e+01 1.070241e+01 1.101538e+01
date_released_year mean 2.018230e+03 2.018567e+03 2.010543e+03 2.013262e+03
var 4.012342e+00 3.833456e+00 5.029017e+00 1.528990e+01
date_released_month mean 9.693489e+00 3.415414e+00 6.343824e+00 5.630769e+00
var 2.869116e+00 3.324437e+00 1.088053e+01 1.079904e+01
date_released_hour mean 1.113759e+01 1.091032e+01 1.094240e+01 1.152308e+01
var 6.582163e+00 7.480315e+00 3.244410e+01 2.059712e+01
date_released_minute mean 3.214742e+01 3.283713e+01 1.764311e+01 2.761538e+01
var 3.840004e+02 3.897591e+02 3.564008e+02 3.713341e+02
In [48]:
# how many in each cluster?
            pd.Series(labels).value_counts().sort_index()
            
Out[48]:
0    1628
            1    2063
            2    1684
            3      65
            dtype: int64
In [49]:
# Add coloring to highlight differences between clusters
            (X.assign(label=labels)
              .groupby('label')
              .mean()
              .T
             .style.background_gradient(cmap='RdBu', axis=1)
            )
            
Out[49]:
label 0 1 2 3
duration_seg 635.036855 645.535143 863.127078 788.646154
views 1579248.831081 1829964.674746 1887790.967933 26169230.769231
likes 48089.123464 55633.806108 57213.657957 792738.461538
date_recorded_year 2017.660319 2017.270480 2009.741686 2012.276923
date_recorded_month 8.097666 6.384876 5.200713 5.215385
date_released_year 2018.229730 2018.566651 2010.543349 2013.261538
date_released_month 9.693489 3.415414 6.343824 5.630769
date_released_hour 11.137592 10.910325 10.942399 11.523077
date_released_minute 32.147420 32.837130 17.643112 27.615385

Clusters¶

  • 0 - Newer videos released in fall
  • 1 - Newer videos released earlier in the year
  • 2 - Older videos, with longer duration (in seconds)
  • 3 - Highest views & likes
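As a usage note, once the scaler and the 4-cluster KMeans are fitted, any new talk's numeric features can be assigned to one of these clusters. The sketch below is self-contained on synthetic data; in the notebook, the std and k9 objects fitted above would play the same roles:

    import numpy as np
    import pandas as pd
    from sklearn import cluster, preprocessing

    # Synthetic stand-in for the 9 numeric features used for clustering
    np.random.seed(0)
    X = pd.DataFrame(np.random.randn(200, 9), columns=[f'feature_{i}' for i in range(9)])

    # Fit the scaler and a 4-cluster KMeans, mirroring the cells above
    scaler = preprocessing.StandardScaler().fit(X)
    kmeans = cluster.KMeans(n_clusters=4, random_state=42).fit(scaler.transform(X))

    # Assign a brand-new row (e.g. a newly released talk) to one of the clusters
    new_talk = X.iloc[[0]] + 0.1  # pretend this is unseen data
    print(kmeans.predict(scaler.transform(new_talk)))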
In [50]:
# describe a column for each label
            (X.assign(label=labels)
              .groupby('label')
              .date_recorded_year
              .describe()
            )
            
Out[50]:
count mean std min 25% 50% 75% max
label
0 1628.0 2017.660319 2.387886 2009.0 2016.0 2018.0 2020.0 2021.0
1 2063.0 2017.270480 2.531764 2009.0 2015.0 2017.0 2019.0 2022.0
2 1684.0 2009.741686 3.598263 1970.0 2009.0 2010.0 2012.0 2021.0
3 65.0 2012.276923 3.457364 2004.0 2010.0 2013.0 2015.0 2019.0
In [51]:
# describe a single cluster (label 0)
            (X.assign(label=labels)
             .query('label == 0')
             .describe()
            )
            
Out[51]:
duration_seg views likes date_recorded_year date_recorded_month date_released_year date_released_month date_released_hour date_released_minute label
count 1628.000000 1.628000e+03 1628.000000 1628.000000 1628.000000 1628.000000 1628.000000 1628.000000 1628.000000 1628.0
mean 635.036855 1.579249e+06 48089.123464 2017.660319 8.097666 2018.229730 9.693489 11.137592 32.147420 0.0
std 394.319134 1.729556e+06 53052.986698 2.387886 2.744551 2.003083 1.693846 2.565573 19.595928 0.0
min 0.000000 1.200000e+03 37.000000 2009.000000 1.000000 2010.000000 5.000000 5.000000 0.000000 0.0
25% 327.750000 4.147500e+05 12000.000000 2016.000000 6.000000 2017.000000 8.000000 9.000000 12.000000 0.0
50% 612.000000 1.300000e+06 41000.000000 2018.000000 9.000000 2018.000000 10.000000 10.000000 36.000000 0.0
75% 831.000000 1.900000e+06 59000.000000 2020.000000 10.000000 2020.000000 11.000000 14.000000 50.000000 0.0
max 4125.000000 1.400000e+07 435000.000000 2021.000000 12.000000 2021.000000 12.000000 20.000000 59.000000 0.0
In [52]:
# Plot with Seaborn
            cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
            fig, ax = plt.subplots(figsize=(10,8))
            sns.scatterplot(x='PC1', y='PC2', 
                            data=pca_X.assign(label=labels),
                            cmap='Pastel',
                            hue='label', ax=ax)
            
Out[52]:
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
In [67]:
from bokeh.io import output_notebook
            from bokeh import models, palettes, transform
            from bokeh.plotting import figure, show
            
            def bokeh_scatter(
                x,
                y,
                data,
                hue=None,
                label_cols=None,
                size=None,
                legend=None,
                alpha=0.5,
            ):
                """
                x - x column name to plot
                y - y column name to plot
                data - pandas dataframe
                hue - column name to color by (numeric)
                legend - column name to label by
                label_cols - columns to use in tooltip (None all in dataframe)
                size - size of points in screen space units
                alpha - transparency
                """
                output_notebook()
                circle_kwargs = {}
                if legend:
                    circle_kwargs["legend"] = legend
                if size:
                    circle_kwargs["size"] = size
                if hue:
                    color_seq = data[hue]
                    mapper = models.LinearColorMapper(
                        palette=palettes.viridis(256),
                        low=min(color_seq),
                        high=max(color_seq),
                    )
                    circle_kwargs[
                        "fill_color"
                    ] = transform.transform(hue, mapper)
                ds = models.ColumnDataSource(data)
                if label_cols is None:
                    label_cols = data.columns
                tool_tips = sorted(
                    [
                        (x, "@{}".format(x))
                        for x in label_cols
                    ],
                    key=lambda tup: tup[0],
                )
                hover = models.HoverTool(
                    tooltips=tool_tips
                )
                fig = figure(
                    tools=[
                        hover,
                        "pan",
                        "zoom_in",
                        "zoom_out",
                        "reset",
                    ],
                    toolbar_location="below",
                )
            
                fig.circle(
                    x,
                    y,
                    source=ds,
                    alpha=alpha,
                    **circle_kwargs
                )
                show(fig)
                return fig
            
            res = bokeh_scatter("PC1","PC2", 
                                data=pd.concat([pca_X, X], axis=1).assign(label=labels), hue='label', size=10,
                                label_cols=list(X.columns)+['label'],
                               legend='label')
                            
            
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
            

STEP 3 - DATA ANALYSIS ¶

Which are the most viewed videos?¶

In [54]:
# 1 - Understanding the distribution of views (in thousands) before the groupby below
            y=1000
            df_graph=df.views.apply(lambda x: round(x/y,0))
            df_graph.describe()
            
Out[54]:
count     5440.000000
            mean      2063.652941
            std       3569.598818
            min          1.000000
            25%        670.750000
            50%       1300.000000
            75%       2100.000000
            max      72000.000000
            Name: views, dtype: float64
In [55]:
# Top 25 most liked videos, using the 75th percentile of likes (~65,000) as the cutoff
            y_var='likes'
            df_grap = (
                (df.groupby(['title','author','date_recorded_year','views'])[y_var].sum().reset_index())
                .sort_values([y_var],ascending=[False])
                ).reset_index()
            df_grap=df_grap.drop('index', axis=1)
            df_grap = df_grap[df_grap[y_var] > 65000]
            print ('Top 25 Liked Videos')
            df_grap.head(25)
            
Top 25 Liked Videos
            
Out[55]:
title author date_recorded_year views likes
0 Do schools kill creativity? Sir Ken Robinson 2006 72000000 2100000
1 The self-organizing computer course Shimon Schocken 2012 64000000 1900000
2 Inside the mind of a master procrastinator Tim Urban 2016 60000000 1800000
3 How great leaders inspire action Simon Sinek 2009 57000000 1700000
4 The power of vulnerability Brené Brown 2010 56000000 1700000
5 How to speak so that people want to listen Julian Treasure 2013 49000000 1400000
6 My philosophy for a happy life Sam Berns 2013 43000000 1300000
7 The next outbreak? We're not ready Bill Gates 2015 43000000 1300000
8 What makes a good life? Lessons from the longe... Robert Waldinger 2015 41000000 1200000
9 Could a Saturn moon harbor life? Carolyn Porco 2009 37000000 1100000
10 Looks aren't everything. Believe me, I'm a model. Cameron Russell 2012 38000000 1100000
11 Why people believe they can't draw Graham Shaw 2015 37000000 1100000
12 The orchestra in my mouth Tom Thum 2013 34000000 1000000
13 How to spot a liar Pamela Meyer 2011 31000000 953000
14 The art of misdirection Apollo Robbins 2013 31000000 933000
15 Why I must speak out about climate change James Hansen 2012 30000000 915000
16 manspace Sam Martin 2009 30000000 909000
17 10 young Indian artists to watch Ravin Agrawal 2009 28000000 857000
18 For more wonder, rewild the world George Monbiot 2013 28000000 855000
19 How to stop screwing yourself over Mel Robbins 2011 28000000 855000
20 The future we're building -- and boring Elon Musk 2017 28000000 849000
21 My stroke of insight Jill Bolte Taylor 2008 28000000 844000
22 A demo of wireless electricity Eric Giler 2009 28000000 843000
23 Strange answers to the psychopath test Jon Ronson 2012 27000000 838000
24 The science of skin color Angela Koine Flynn 2015 26000000 789000

Are there any authors that have presented more than once?¶

For this question we will use the raw data obtained from Kaggle, because some talks were lost during scraping. Surprisingly, there are 536 authors that have presented more than once, most averaging 2 presentations. However, 15% of these presenters have presented at least 3 times, with one speaker, "Alex Gendler", accounting for as many as 45 talks.

In [70]:
#obtaining dataset 
            datasets='/Volumes/Datasets'
            raw_data=pd.read_csv(os.path.join(datasets,'ted_talks/1_raw_data/ted-talks-website.csv'))
            df2=raw_data.copy()
            df2
            
Out[70]:
title author date views likes link
0 Climate action needs new frontline leadership Ozawa Bineshi Albert December 2021 404000 12000 https://ted.com/talks/ozawa_bineshi_albert_cli...
1 The dark history of the overthrow of Hawaii Sydney Iaukea February 2022 214000 6400 https://ted.com/talks/sydney_iaukea_the_dark_h...
2 How play can spark new ideas for your business Martin Reeves September 2021 412000 12000 https://ted.com/talks/martin_reeves_how_play_c...
3 Why is China appointing judges to combat clima... James K. Thornton October 2021 427000 12000 https://ted.com/talks/james_k_thornton_why_is_...
4 Cement's carbon problem — and 2 ways to fix it Mahendra Singhi October 2021 2400 72 https://ted.com/talks/mahendra_singhi_cement_s...
... ... ... ... ... ... ...
5435 The best stats you've ever seen Hans Rosling February 2006 15000000 458000 https://ted.com/talks/hans_rosling_the_best_st...
5436 Do schools kill creativity? Sir Ken Robinson February 2006 72000000 2100000 https://ted.com/talks/sir_ken_robinson_do_scho...
5437 Greening the ghetto Majora Carter February 2006 2900000 88000 https://ted.com/talks/majora_carter_greening_t...
5438 Simplicity sells David Pogue February 2006 2000000 60000 https://ted.com/talks/david_pogue_simplicity_s...
5439 Averting the climate crisis Al Gore February 2006 3600000 109000 https://ted.com/talks/al_gore_averting_the_cli...

5440 rows × 6 columns

In [71]:
#Modifiable variables
            x_var1='author'
            y_var='title'
            # build data
            df_graph=(
                (df2.groupby([x_var1])[y_var].count().reset_index()
                ).sort_values([y_var],ascending=[False])
            )
            df_graph=df_graph[df_graph[y_var]>1]
            print('')
            print(df_graph.describe()) 
            print('')
            print(df_graph.head(20))
            
                        title
            count  536.000000
            mean     2.858209
            std      2.788234
            min      2.000000
            25%      2.000000
            50%      2.000000
            75%      3.000000
            max     45.000000
            
                                author  title
            148           Alex Gendler     45
            1781      Iseult Gillespie     33
            2845           Matt Walker     18
            152         Alex Rosenthal     15
            1283         Elizabeth Cox     13
            1338            Emma Bryce     12
            962          Daniel Finkel     11
            2216         Juan Enriquez     11
            933             Dan Finkel      9
            1655          Hans Rosling      9
            4338      Wendy De La Rosa      9
            1609             Greg Gage      9
            3029          Mona Chalabi      9
            1962            Jen Gunter      9
            544             Bill Gates      8
            2729         Marco Tempest      7
            31                  TED-Ed      7
            943           Dan Kwartler      7
            1821  Jacqueline Novogratz      6
            2213               Joy Lin      6
            
In [72]:
#outliers
            df_graph2=(
                df_graph.title
                .reset_index()
                .set_index('index')
            )
            df_graph2.plot.box()
            
Out[72]:
<AxesSubplot:>
In [73]:
#Modifiable variables
            y_var='views'
            x_var1='author'
            x_var2='likes'
            
            #build data
            df_graph = (
                (df2.groupby([x_var1, x_var2])[y_var].sum().reset_index())
                .sort_values([y_var],ascending=[False])
                ).reset_index().head(20)
            df_graph=df_graph.drop('index', axis=1)
            df_graph
            
Out[73]:
author likes views
0 Sir Ken Robinson 2100000 72000000
1 Amy Cuddy 1900000 64000000
2 Tim Urban 1800000 60000000
3 Simon Sinek 1700000 57000000
4 Brené Brown 1700000 56000000
5 Julian Treasure 1400000 49000000
6 Sam Berns 1300000 43000000
7 Bill Gates 1300000 43000000
8 Robert Waldinger 1200000 41000000
9 Cameron Russell 1100000 38000000
10 Graham Shaw 1100000 37000000
11 Mary Roach 1100000 37000000
12 Tom Thum 1000000 34000000
13 Pamela Meyer 953000 31000000
14 Apollo Robbins 933000 31000000
15 Susan Cain 915000 30000000
16 Chimamanda Ngozi Adichie 909000 30000000
17 Kelly McGonigal 855000 28000000
18 Elon Musk 849000 28000000
19 Mel Robbins 855000 28000000

Is there a substantial difference between liked and viewed? If so, which are the top videos in each?¶

Views and likes are directly correlated; although some talks may shift slightly in order, we can treat the two rankings as effectively the same. A quick correlation check is sketched below.
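A direct way to quantify this is the Pearson correlation between the two columns, a minimal check that assumes the df built in Step 1:

    # Pearson correlation between views and likes in the preprocessed frame
    print(df[['views', 'likes']].corr())

    # Or as a single number
    print(df['views'].corr(df['likes']))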

In [74]:
df_graph=df.iloc[:, :14]
            fig, ax = plt.subplots(figsize=(8,8))
            sns.heatmap(df_graph.corr(), cmap='RdBu', vmin=-1, vmax=1, annot=True, square=True, ax=ax)
            
Out[74]:
<AxesSubplot:>

Which content categories are most viewed?¶

The largest number of talks relate to health (particularly scientific topics), society, and technology.

In [75]:
df_graph=(
                        df_dummies.
                        groupby(['category'])
                        .agg({'likes':['sum'],'views':['sum'], 'num_talks':['sum']})
                        
            )
            df_graph.columns=['likes', 'views', 'num_talks']
            df_graph=df_graph.sort_values(by=['num_talks'], ascending=False)
            
            #plot
            sns.scatterplot(data=df_graph, x="likes", y="views", size="num_talks", legend=True, hue='category', alpha=0.5, sizes=(40, 400)) 
            plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=10)
            sns.axes_style({
                'axes.facecolor': 'white',
                'axes.edgecolor': 'black',
                'axes.grid': False,
                'figure.facecolor': 'white',
                 'grid.color': 'white',
                 'grid.linestyle': '-',
                 'font.sans-serif': 'Arial',
                 'grid.color': '#ffffff'
            
            })
            sns.set(rc={"figure.figsize":(9 , 9)}) #(width,height)
            plt.show()
            
In [76]:
df_graph=(
                        df_dummies.
                        groupby(['sub_category'])
                        .agg({'likes':['sum'],'views':['sum'], 'num_talks':['sum']})
                        
            )
            df_graph.columns=['likes', 'views', 'num_talks']
            df_graph=(df_graph
                .sort_values(by=['num_talks'], ascending=False)
                .iloc[1:20,:]
            )
            
            # df_graph
            # plot
            sns.scatterplot(data=df_graph, x="likes", y="views", size="num_talks", legend=True, hue='sub_category', alpha=0.5, sizes=(40, 400)) 
            plt.legend(bbox_to_anchor=(1, 1), loc='upper left', fontsize=10)
            sns.axes_style({
                'axes.facecolor': 'white',
                'axes.edgecolor': 'black',
                'axes.grid': False,
                'figure.facecolor': 'white',
                 'grid.color': 'white',
                 'grid.linestyle': '-',
                 'font.sans-serif': 'Arial',
                 'grid.color': '#ffffff'
            
            })
            sns.set(rc={"figure.figsize":(9 ,9)}) #(width,height)
            plt.show()
            

Could the duration of the video affect its likability?¶

In most years, the distribution stays within roughly the first 1000 sec (16 min 40 sec). However, in 2020 you can see many more liked videos that are far longer, reaching ~4000 sec (about 1 hr 7 min). I like to think this can be explained by the worldwide lockdowns during the pandemic, when people were stuck at home with few content options. In that context, it is likely people were more willing to spend more time on TED Talks.

In [77]:
# Are there outliers
            df.duration_seg.plot.box()
            
Out[77]:
<AxesSubplot:>
In [78]:
#relationship between duration and likes
            df_graph=df
            sns.relplot(x='duration_seg', y='likes', data=df_graph, alpha=.1)
            
Out[78]:
<seaborn.axisgrid.FacetGrid at 0x7fcd89258588>
In [79]:
#Insight: comparing a pre-pandemic year (2019) with a pandemic year (2020)
            df2=df[df.date_recorded_year==2019]
            df3=df[df.date_recorded_year==2020]
            df_graph=pd.concat([df2,df3 ])
            sns.relplot(x='duration_seg', y='likes', data=df_graph, col='date_recorded_year', col_wrap=2, alpha=.1)
            
Out[79]:
<seaborn.axisgrid.FacetGrid at 0x7fcd593306d8>

When was the video published?¶

Most videos were recorded after 2000. However, there are 8 videos that date back before that year, going as far back as 1970.

We also notice more recordings in the winter months than in the summer or fall.

In [80]:
title='Number of Videos Recorded per Year'
            y_label='Num Videos'
            
            fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(13,5), sharex=True)
            ax=df.date_recorded_year.hist(alpha=0.6)
            plt.plot(ax=ax)
            plt.grid(axis='x')
            ax.set_facecolor("white")
            ax.set_ylabel(y_label)
            plt.grid(axis='y', color='black', alpha=.2)
            plt.title(title, ha='center', fontsize='xx-large')
            
Out[80]:
Text(0.5, 1.0, 'Number of Videos Recorded per Year')
In [81]:
# Are there outliers
            title='Most Videos were Recorded Between 2012-2020'
            ax=df.date_recorded_year.plot.box()
            plt.plot(ax=ax)
            plt.grid(axis='x')
            ax.set_facecolor("white")
            plt.grid(axis='y', color='black', alpha=.2)
            plt.title(title, ha='center', fontsize='xx-large')
            
Out[81]:
Text(0.5, 1.0, 'Most Videos were Recorded Between 2012-2020')

Understanding date_released_month as a key feature¶

In [82]:
data=df[df.date_recorded_year>2000]
            fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(13,5), sharex=True)
            
            #plotting first histogram
            ax=(data
                .groupby(['date_recorded_month'])
                .likes
                .count()
                .plot(x='date_recorded_month', kind = 'bar',alpha=0.6, ax=ax,) 
            )
            #plotting second histogram
            ax=(data
                .groupby(['date_released_month'])
                .likes
                .count()
                .plot(x='date_released_month', kind = 'bar',alpha=0.5, ax=ax, color='#76725e') 
            )
            #improving labels
            ax.set_xticks(ticks=range(0,12,1))  
            ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
            ax.set_xlabel('')
            ax.set_ylabel('Count Videos ')
            #styling grid, legend and title
            plt.title('Monthly Videos Recorded vs Released', ha='center', fontsize='xx-large')
            plt.legend(["Recorded", "Released"], loc='upper center',ncol=2, bbox_to_anchor=(0.5, 1.1), borderaxespad=2.6, facecolor="white")
            ax.set_facecolor("white")
            plt.grid(axis='y', color='black', alpha=.2)
            

Understanding video_duration as a key feature¶

In [83]:
df.duration_seg.describe()
            
Out[83]:
count    5440.000000
            mean      711.460846
            std       463.023031
            min         0.000000
            25%       361.750000
            50%       686.000000
            75%       934.000000
            max      9915.000000
            Name: duration_seg, dtype: float64
In [87]:
df2
            
Out[87]:
author title description_1 duration_seg description_2 views likes date_recorded_year date_recorded_month date_released_year ... 'water' 'weather' 'windenergy' 'women' 'womeninbusiness' 'work' 'work-lifebalance' 'writing' 'youth' 'ted'
9 Dwan Reece The origins of blackface and Black stereotypes "If you're wondering why blackface -- mimickin... 0 "If you're wondering why blackface -- mimickin... 584000 17000 2019 3 2022 ... 0 0 0 0 0 0 0 0 0 1
115 Mona Chalabi How accurate is the weather forecast? "No one remembers when you're right, but no on... 118 "No one remembers when you're right, but no on... 1200000 36000 2019 8 2021 ... 0 1 0 0 0 0 0 0 0 1
294 Robert A. Belle The emotions behind your money habits 'Your money habits reveal a lot about you: you... 523 'Your money habits reveal a lot about you: you... 1600000 48000 2019 3 2021 ... 0 0 0 0 0 0 0 0 0 1
340 TED Audio Collective Introducing Body Stuff with Dr. Jen Gunter "Should I do a juice cleanse? Do I really need... 130 "Should I do a juice cleanse? Do I really need... 2400000 72000 2019 4 2021 ... 0 0 0 0 0 0 0 0 0 1
359 Tai Simpson The intergenerational wisdom woven into Indige... "The way we behave politically, socially, econ... 1059 "The way we behave politically, socially, econ... 1100000 34000 2019 4 2021 ... 0 0 0 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1700 Maryn McKenna Antibiotics changed our food. Here's how to ch... 'In this sobering talk, health writer Maryn Mc... 749 'In this sobering talk, health writer Maryn Mc... 2900000 89000 2019 1 2019 ... 0 0 0 0 0 0 0 0 0 1
1705 Chiki Sarkar How India's smartphone revolution is creating ... "India has the second largest population of an... 606 "India has the second largest population of an... 1600000 49000 2019 1 2019 ... 0 0 0 0 0 0 0 1 0 1
1707 Chiki Sarkar How India's smartphone revolution is creating ... "India has the second largest population of an... 606 "India has the second largest population of an... 12000000 366000 2019 1 2019 ... 0 0 0 0 0 0 0 1 0 1
1709 Wendy De La Rosa 3 psychological tricks to help you save money "We all want to save more money -- but overall... 350 "We all want to save more money -- but overall... 3600000 109000 2019 1 2019 ... 0 0 0 0 0 1 1 0 0 1
1712 Eva-Maria Geigl The history of the world according to cats "In ancient times, wildcats were fierce carniv... 260 "In ancient times, wildcats were fierce carniv... 6600000 198000 2019 1 2019 ... 0 0 0 0 0 0 0 0 0 1

544 rows × 363 columns

In [92]:
df_graph=df2.loc[:,['duration_seg']]
            df_graph
            
Out[92]:
duration_seg
9 0
115 118
294 523
340 130
359 1059
... ...
1700 749
1705 606
1707 606
1709 350
1712 260

544 rows × 1 columns

STEP 4 - MACHINE LEARNING ¶

A) Predicting whether a video will perform well¶

The goal is to predict 'good performance' for a given video, where we define 'good performance' as being at or above the 75th percentile of likes.

We run 4 different models and compare their performance (a sketch of such a comparison follows the table):

Model                 Y_Prediction
Logistic Regression   0.56
Simple Tree           0.59
Random Forest         0.64
XGBoost               0.68
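A comparison like the one in the table can be computed with a single loop along the following lines. This is only a sketch: it assumes the balanced X_train/y_train and X_valid/y_valid splits created later in this step, and uses ROC AUC as one possible common metric:

    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score
    from xgboost import XGBClassifier

    # Candidate models, mirroring the ones fitted individually in the cells below
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Simple Tree': DecisionTreeClassifier(max_depth=10),
        'Random Forest': RandomForestClassifier(),
        'XGBoost': XGBClassifier(),
    }

    # Fit each model on the training split and score it on the validation split
    for name, model in models.items():
        model.fit(X_train, y_train)
        proba = model.predict_proba(X_valid)[:, 1]
        print(f'{name}: ROC AUC = {roc_auc_score(y_valid, proba):.2f}')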

Based on the best performing model, XGBoost, the key features are:

  • Date-related variables, where we can see that for well performing videos:

    • more videos performed well at around 5 min duration, while underperforming videos tended to last longer
    • well performing videos tended to be released during spring

  • Keywords related to:

    • Personal growth: personality, goals, motivation, collaboration, communication, humanity, self, performance, creativity
    • Work: business, work-life balance, productivity
    • Global issues: culture, politics, climate change, planets, gender, virus
    • Other topics: music, sports, philosophy, art, health

Preparing data for models¶

In [94]:
# What constitutes a good video based on likes (in thousands)?
            y=1000
            df_graph=df.likes.apply(lambda x: round(x/y,0))
            df_graph.describe()
            
Out[94]:
count    5440.000000
            mean       62.666912
            std       107.730958
            min         0.000000
            25%        20.000000
            50%        41.000000
            75%        65.000000
            max      2100.000000
            Name: likes, dtype: float64
In [95]:
#verifying there are no NaN values in the data feeding the model
            df[df.likes.isnull()==True]
            
Out[95]:
author title description_1 duration_seg description_2 views likes date_recorded_year date_recorded_month date_released_year ... 'water' 'weather' 'windenergy' 'women' 'womeninbusiness' 'work' 'work-lifebalance' 'writing' 'youth' 'ted'

0 rows × 363 columns

In [96]:
#create target
            
            #we define the TARGET as 1 for a well performing video, i.e. one above the 75th percentile of likes, so the model
            #should predict whether a video will perform above that threshold
            threshold= np.percentile(df.likes, 75)
            
            #create target column
            df['target']=[1 if x>threshold else 0 for x in df.likes]
            
In [97]:
#drop likes and views to avoid multicollinearity and leaking the target
            df_d=df.drop(['likes', 'views'], axis=1)
            
In [98]:
#drop text columns
            df_d=df_d.drop(['author', 'title', 'description_1', 'description_2', 'keywords2'], axis=1)
            
In [99]:
data=df_d.copy()
            data.head(3)
            
Out[99]:
duration_seg date_recorded_year date_recorded_month date_released_year date_released_month date_released_hour date_released_minute "alzheimer's" '' '3dprinting' ... 'weather' 'windenergy' 'women' 'womeninbusiness' 'work' 'work-lifebalance' 'writing' 'youth' 'ted' target
0 834 2021 12 2022 2 9 41 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
1 0 2022 2 2022 2 10 13 0 0 0 ... 0 0 1 0 0 0 0 0 1 0
2 665 2021 9 2022 2 9 51 0 0 0 ... 0 0 0 0 1 0 0 0 1 0

3 rows × 357 columns

In [100]:
#Balance data
            data.target.value_counts()
            
Out[100]:
0    4098
            1    1342
            Name: target, dtype: int64
In [101]:
positive_labels = data[data.target==1]
            num_positive_labels = positive_labels.shape[0]
            num_positive_labels
            
Out[101]:
1342
In [102]:
negative_labels = data[data.target==0].sample(num_positive_labels)
            negative_labels.shape
            
Out[102]:
(1342, 357)
In [103]:
balanced_data =  positive_labels.append(negative_labels)
            balanced_data.target.value_counts()
            
Out[103]:
1    1342
            0    1342
            Name: target, dtype: int64
In [104]:
## Splitting data into test splits
            y = balanced_data.pop('target')
            X = balanced_data
            
In [105]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3)
            X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, test_size = 0.33)
            X_train.head()
            
Out[105]:
duration_seg date_recorded_year date_recorded_month date_released_year date_released_month date_released_hour date_released_minute "alzheimer's" '' '3dprinting' ... 'water' 'weather' 'windenergy' 'women' 'womeninbusiness' 'work' 'work-lifebalance' 'writing' 'youth' 'ted'
4033 1029 2012 9 2012 11 10 15 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1548 875 2019 3 2019 3 10 19 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1422 78 2019 6 2019 6 13 30 0 0 0 ... 0 0 0 0 0 0 0 1 0 1
1827 310 2018 7 2018 10 13 16 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1719 180 2018 12 2018 12 13 46 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 356 columns

Models¶

Logistic Regression¶

In [106]:
# fit a model
            clf = LogisticRegression(penalty='l2').fit(X_train, y_train)
            # predict probabilities
            predictions = clf.predict_proba(X_test)[:, 1]
            
/Users/alialvarez/opt/anaconda3/envs/clustering/lib/python3.6/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
            STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
            
            Increase the number of iterations (max_iter) or scale the data as shown in:
                https://scikit-learn.org/stable/modules/preprocessing.html
            Please also refer to the documentation for alternative solver options:
                https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
              extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
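The convergence warning above suggests either scaling the features or increasing max_iter. A minimal sketch of that adjustment, reusing the same splits and kept separate from clf so the cells below are unchanged, could look like this:

    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Scale the features and give the lbfgs solver more iterations to converge
    clf_scaled = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', max_iter=1000))
    clf_scaled.fit(X_train, y_train)
    predictions_scaled = clf_scaled.predict_proba(X_test)[:, 1]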
            
In [107]:
# Feature Importance
            feature_importance = abs(clf.coef_[0])
            feature_importance = 100.0 * (feature_importance / feature_importance.max())
            sorted_idx = np.argsort(feature_importance)
            pos = np.arange(sorted_idx.shape[0]) + .5
            
            featfig = plt.figure(figsize=(10, 15))
            featax = featfig.add_subplot(1, 1, 1)
            featax.barh(pos, feature_importance[sorted_idx], align='center')
            featax.set_yticks(pos)
            featax.set_yticklabels(np.array(X.columns)[sorted_idx], fontsize=8)
            
            plt.show()
            
In [108]:
#Zooming into the top 20 features (positives and negatives)
            data=abs(pd.Series(clf.coef_[0], index=X.columns.values)
              .sort_values()
              .iloc[[0,1,2,3,4,5, 6, 7, 8, 9,-9, -8, -7, -6, -5, -4, -3, -2, -1]]
              # .plot.barh()
            )
            data.sort_values(ascending=False).plot.barh()
            
Out[108]:
<AxesSubplot:>
In [109]:
# Predict probabilities given test data
            y_pred = clf.predict_proba(X_test)
            pred_reg=y_pred
            
In [110]:
# calculate scores
            auc = roc_auc_score(y_test, predictions)
            
            # calculate roc curves
            fpr, tpr, _ = roc_curve(y_test, predictions)
            
            plt.figure(figsize=(15, 10))
            # plot horizontal line 
            plt.plot([0, 1], [0, 1], linestyle='--')
            # plot the roc curve for the model
            plt.plot(fpr, tpr, label='ROC curve (AUC = %0.2f)' % auc)
            # axis labels
            plt.xlabel('FPR')
            plt.ylabel('TPR')
            # show the legend
            plt.legend(loc='lower right')
            # show the plot
            plt.show()
            

Decision Tree¶

In [111]:
from sklearn.tree import DecisionTreeClassifier
            
            dt_model = DecisionTreeClassifier(max_depth=10)
            
            print(dt_model)
            
            dt_model = dt_model.fit(X_train,y_train)
            pred_dt = dt_model.predict_proba(X_valid)[:, 1]
            
DecisionTreeClassifier(max_depth=10)
            
In [112]:
from sklearn.metrics import classification_report
            pred_dt_binary = dt_model.predict(X_valid)
            print(classification_report(y_valid, pred_dt_binary))
            
              precision    recall  f1-score   support
            
                       0       0.55      0.60      0.58       268
                       1       0.57      0.52      0.55       272
            
                accuracy                           0.56       540
               macro avg       0.56      0.56      0.56       540
            weighted avg       0.56      0.56      0.56       540
            
            

Random Forest¶

In [113]:
from sklearn.ensemble import RandomForestClassifier
            
            rf_model = RandomForestClassifier()
            print(rf_model)
            
            rf_model = rf_model.fit(X_train, y_train)
            pred_rf = rf_model.predict_proba(X_valid)[:, 1]
            print(classification_report(y_valid, pred_rf.round(0)))
            
RandomForestClassifier()
                          precision    recall  f1-score   support
            
                       0       0.65      0.58      0.61       268
                       1       0.63      0.69      0.65       272
            
                accuracy                           0.64       540
               macro avg       0.64      0.63      0.63       540
            weighted avg       0.64      0.64      0.63       540
            
            
In [114]:
#Feature importance across all 356 features (hard to read at this scale)
            plt.barh(X.columns.values, rf_model.feature_importances_)
            
Out[114]:
<BarContainer object of 356 artists>
In [115]:
(pd.Series(rf_model.feature_importances_, index=X.columns.values)
              .sort_values(ascending=False)
              .iloc[:10]
              .plot.barh()
            )
            
Out[115]:
<AxesSubplot:>

XGBoost ¶

In [116]:
#code to fix error taken from: https://stackoverflow.com/questions/43579180/feature-names-must-be-unique-xgboost
            X_train = X_train.loc[:,~X_train.columns.duplicated()]
            X_valid = X_valid.loc[:,~X_valid.columns.duplicated()]
            
In [117]:
from xgboost import XGBClassifier
            
            xgb_model = XGBClassifier()
            
            xgb_model = xgb_model.fit(X_train, y_train)
            pred_xgb = xgb_model.predict_proba(X_valid)[:, 1]
            
/Users/alialvarez/opt/anaconda3/envs/clustering/lib/python3.6/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
              warnings.warn(label_encoder_deprecation_msg, UserWarning)
            
[11:29:07] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
            
In [118]:
#Feature importance
            from xgboost import plot_importance
            # plot feature importance
            plot_importance(xgb_model)
            plt.show()
            
In [119]:
#top 40 XGBoost features by importance
            ax=(pd.Series(xgb_model.feature_importances_, index=X.columns.values)
              .sort_values(ascending=False)
              .iloc[:40]
              .sort_values(ascending=True)
              .plot.barh()
            )
            
            #improving labels
            ax.set_xlabel('Importance')
            ax.set_ylabel('Features')
            
            #styling grid and title
            plt.title('Feature Importance', ha='center', fontsize='x-large')
            ax.set_facecolor("white")
            plt.grid(axis='y', color='black', alpha=.2)
            

Comparing ML Models¶

In [120]:
y_pred = clf.predict_proba(X_test)
            pred_reg=y_pred[:, 1]
            
In [121]:
def create_roc_plot(name, predictions):
                if name == 'Logistic':
                    auc = roc_auc_score(y_test, predictions).round(2)
                    fpr, tpr, _ = roc_curve(y_test, predictions)
                else: 
                    auc = roc_auc_score(y_valid, predictions).round(2)
                    fpr, tpr, _ = roc_curve(y_valid, predictions)
            
                plt.figure(figsize=(5, 4))
                plt.plot([0, 1], [0, 1], linestyle='--')  # plot the no-skill diagonal
                plt.plot(fpr, tpr, label='{} AUC = {}'.format(name, auc)) # plot the roc curve for the model
                plt.xlabel('FPR')
                plt.ylabel('TPR')
                plt.legend(loc='lower right')  # show the legend
                plt.show() # show the plot
                
                return None
            
In [122]:
create_roc_plot('Logistic', pred_reg)
            create_roc_plot('Decision Tree', pred_dt)
            create_roc_plot('Random Forest', pred_rf)
            create_roc_plot('XGBoost', pred_xgb)
            

Understanding video_duration as key feature¶

In [123]:
df.duration_seg.describe()
            
Out[123]:
count    5440.000000
            mean      711.460846
            std       463.023031
            min         0.000000
            25%       361.750000
            50%       686.000000
            75%       934.000000
            max      9915.000000
            Name: duration_seg, dtype: float64
In [124]:
step=60
            bar=list(np.arange(0, 934,step))
            # bar.append(9915)
            
            df2=pd.concat([X,y], axis=1)
            df_graph=df2.loc[:,['duration_seg', 'target']]
            df_graph=(df_graph    
              .assign(bin=pd.cut(df_graph.duration_seg, bar))
              .groupby(['bin', 'target'])
              .size()
              .unstack()
            )
            
            dfgraph_y=df_graph.iloc[:,1]
            dfgraph_n=df_graph.iloc[:,0]
            
            #improving graph
            fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(13,5), sharex=True)
            
            #plotting first histogram
            ax=(dfgraph_y.plot( kind = 'bar',alpha=0.6, ax=ax,))
            
            #plotting second histogram
            ax=(dfgraph_n.plot( kind = 'bar',alpha=0.5, ax=ax, color='#76725e'))
            
            #improving labels
            ax.set_xlabel('Minutes')
            ax.set_ylabel('Count Videos ')
            ax.set_xticks(ticks=range(0,len(bar)-1,1))  
            ax.set_xticklabels(range(1,len(bar),1))
            
            #styling grid, legend and title
            plt.title('Duration in Minutes by Performing and Not Performing Videos', ha='center', fontsize='xx-large')
            plt.legend(["Performing", "Not Performing"], loc='upper center',ncol=2, bbox_to_anchor=(0.5, 1.1), borderaxespad=2.6, facecolor="white")
            ax.set_facecolor("white")
            plt.grid(axis='y', color='black', alpha=.2)
            

Understanding date_released_month as key feature¶

In [125]:
df2=pd.concat([X,y], axis=1)
            df_graph=df2.loc[:,['date_released_month', 'target']]
            dfgraph_y=df_graph[df_graph.target==1]
            dfgraph_n=df_graph[df_graph.target==0]
            
            
            
            #improving graph
            fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(13,5), sharex=True)
            
            #plotting performing videos
            ax=(dfgraph_y.groupby(['date_released_month'])
                .agg({'target':['sum']})
                .plot(kind='bar', alpha=0.6, ax=ax)
            )
            #plotting not performing videos
            ax=(dfgraph_n.groupby(['date_released_month'])
                .agg({'target':['count']})
                .plot(kind='bar', alpha=0.5, ax=ax, color='#76725e')
            )
            
            #improving labels
            ax.set_xticks(ticks=range(0,12,1))
            ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
            ax.set_xlabel('')
            ax.set_ylabel('Count Videos')
            
            #styling grid, legend and title
            plt.title('Released Month by Performing and Not Performing Videos', ha='center', fontsize='xx-large')
            plt.legend(["Performing", "Not Performing"], loc='upper center',ncol=2, bbox_to_anchor=(0.5, 1.1), borderaxespad=2.6, facecolor="white")
            ax.set_facecolor("white")
            plt.grid(axis='y', color='black', alpha=.2)
            

Understanding date_released_year as key feature¶

In [126]:
df2=pd.concat([X,y], axis=1)
            df_graph=df2.loc[:,['date_released_year', 'target']]
            dfgraph_y=df_graph[df_graph.target==1]
            dfgraph_n=df_graph[df_graph.target==0]
            
            
            
            #improving graph
            fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(13,5), sharex=True)
            
            #plotting performing videos
            ax=(dfgraph_y.groupby(['date_released_year'])
                .agg({'target':['sum']})
                .plot(kind='bar', alpha=0.6, ax=ax)
            )
            #plotting not performing videos
            ax=(dfgraph_n.groupby(['date_released_year'])
                .agg({'target':['count']})
                .plot(kind='bar', alpha=0.5, ax=ax, color='#76725e')
            )
            
            #improving labels
            ax.set_xlabel('')
            ax.set_ylabel('Count Videos')
            
            #styling grid, legend and title
            plt.title('Released Year by Performing and Not Performing Videos', ha='center', fontsize='xx-large')
            plt.legend(["Performing", "Not Performing"], loc='upper center',ncol=2, bbox_to_anchor=(0.5, 1.1), borderaxespad=2.6, facecolor="white")
            ax.set_facecolor("white")
            plt.grid(axis='y', color='black', alpha=.2)
            

B) ML Recommendation of Ted Talk¶

The goal is to recommend a TED Talk based on a previously liked one.

In [127]:
#define variables to use in model
            review=df.description_1
            title=df.title
            
In [128]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
            tfidf = TfidfVectorizer(stop_words='english')#max_features=5000
            
            #Construct the required TF-IDF matrix by fitting and transforming the data
            tfidf_matrix = tfidf.fit_transform(review)
            
            #Output the shape of tfidf_matrix
            tfidf_matrix.shape
            
            #create matrix
            cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
            
In [129]:
#extract indices
            indices = (pd.Series(df.index, index=title)
                .reset_index()
                .drop_duplicates(subset=['title'], keep='first')
                ).set_index('title')
                      
            indices.columns=['index']
            indices=indices.squeeze()
            indices
            
Out[129]:
title
            Climate action needs new frontline leadership                  0
            The dark history of the overthrow of Hawaii                    1
            Why play is essential for business                             2
            Why is China appointing judges to combat climate change?       3
            Cement's carbon problem -- and 2 ways to fix it                4
                                                                        ... 
            Let's teach religion -- all religion -- in schools          5431
            Letting go of God                                           5433
            Do schools kill creativity?                                 5436
            Greening the ghetto                                         5437
            Averting the climate crisis                                 5439
            Name: index, Length: 3014, dtype: int64
In [130]:
def get_recommendations(title, cosine_sim=cosine_sim):
                # Get the index of the talk that matches the title
                idx = indices[title]
            
            #     Get the pairwise similarity scores of all talks with that talk
                sim_scores = list(enumerate(cosine_sim[idx]))
                
            #     Sort the talks based on the similarity scores
                sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            
            #     Remove duplicate indices
                sim_scores=pd.Series(v[0] for v in sim_scores).drop_duplicates()
            
                # Get the talk indices
                recommendations=(
                    df.title.iloc[sim_scores]
                    .drop_duplicates()
                    [1:11]
                    .reset_index()
                ).drop('index', axis=1)
             
                # Return the top 10 most similar values
                return recommendations
            
In [131]:
talk_liked='Can machines read your emotions?'
            
            indices[talk_liked]
            
Out[131]:
2732
In [132]:
# change display option to be able to see the full title name
            pd.set_option('display.max_colwidth', None)
            get_recommendations(talk_liked)
            
Out[132]:
title
0 How computer memory works
1 What's it like to be a robot?
2 Why incompetent people think they're amazing
3 The wonderful and terrifying implications of computers that can learn
4 The rise of personal robots
5 What is the Heisenberg Uncertainty Principle?
6 Make robots smarter
7 Older people are happier
8 Diversity in harmony
9 Why do your knuckles pop?
In [133]:
#compare the recommendation engine results against a simple keyword search on the title
            df_graph=df.query('title.str.contains("machines")', engine='python')
            df_graph[['author', 'title', 'likes']].drop_duplicates(subset='title').sort_values(by=['likes'], ascending=False)
            
Out[133]:
author title likes
2977 Raffaello D'Andrea Meet the dazzling flying machines of the future 302000
2835 Anthony Goldbloom The jobs we'll lose to machines -- and the ones we won't 85000
2571 Garry Kasparov Don't fear intelligent machines. Work with them 56000
2752 Tim Leberecht 4 ways to build a human company in the age of machines 50000
3894 Erik Brynjolfsson The key to growth? Race with the machines 41000
2473 Radhika Nagpal What intelligent machines can learn from a school of fish 39000
2732 Kostas Karpouzis Can machines read your emotions? 9300
3138 Markus Lorenz Industry 4.0: how intelligent machines will transform everything we know 6400

Conclusion¶

General Findings¶

* Which are the most viewed videos?¶

'Do schools kill creativity?' by Sir Ken Robinson, 'The self-organizing computer course' by Shimon Schocken and 'Inside the mind of a master procrastinator' by Tim Urban are the top three videos in our dataset.
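
This ranking can be reproduced with a simple sort; a minimal sketch, assuming the merged dataframe df keeps a numeric views column alongside title and author:

# top three most viewed talks (assumes a numeric 'views' column in df)
            (df[['title', 'author', 'views']]
              .drop_duplicates(subset='title')
              .sort_values('views', ascending=False)
              .head(3))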

* Is there a substantial difference between liked and viewed?¶

Views and likes are strongly correlated, so ranking videos by either metric produces essentially the same ordering.
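
A quick sanity check of that claim, assuming both views and likes are numeric columns of df:

# Pearson correlation between views and likes (column names assumed from the dataset)
            df[['views', 'likes']].corr()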

* Are there any authors that have presented more than once?¶

Yes, in fact many! Even though the majority have presented at most 2 talks, there are some interesting outliers like Alex Gendler, who has 45 talks so far.
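
A minimal way to verify this from the main dataframe, using the author column seen earlier:

# number of distinct talks per author, most prolific first
            (df.drop_duplicates(subset='title')
               .author
               .value_counts()
               .head(10))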

* Which content categories are most viewed?¶

Health, Society and Technology, along with general-interest subjects.

* Could the duration of the video affect the likability of the videos?¶

In most years, the distribution stays within the first 1,000 sec (16 min 40 sec). However, in 2020 there are noticeably more liked videos that run far longer, reaching ~4,000 sec (about 1 hr 7 min). I like to think this can be explained by the worldwide lockdowns during the pandemic, when people were stuck at home with fewer content options; in that context, they were likely willing to spend more time on TED Talks.
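
One way to visualize that shift is a box plot of duration per release year for the well-performing videos only; this is a sketch that reuses the df2 = pd.concat([X, y], axis=1) frame built above and its duration_seg, date_released_year and target columns:

# duration distribution of performing videos by release year (sketch)
            df2 = pd.concat([X, y], axis=1)
            (df2[df2.target == 1]
               .boxplot(column='duration_seg', by='date_released_year',
                        figsize=(13, 5), rot=45))
            plt.suptitle('')
            plt.title('Duration of performing videos by release year')
            plt.ylabel('Seconds')
            plt.show()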

* When was the video published?¶

The dataset contains mostly videos published after 2000; however, 8 outliers date back as far as 1970.

Other techniques we can apply¶

1) In order to properly answer the question 'Which content categories are made available the most?', we should compare the keywords used on the website with labels we can extract from the talk descriptions. For this we could use a Named Entity Recognition (NER) model, such as this:

import spacy
            from spacy import displacy
            
            nlp = spacy.load("en_core_web_sm")
            
            list_text = []
            list_ent = []
            
            # run the NER pipeline over every talk description and collect the entities found
            for row in df.description_2:
                doc = nlp(row)
                # displacy.render(doc, style='ent', jupyter=True)  # optional: render entities inline
                for ent in doc.ents:
                    list_text.append(ent.text)
                    list_ent.append(ent.label_)
            
            # one row per recognized entity, with its text and label
            text_df = pd.DataFrame(list_text, columns=['text'])
            text_df['ent'] = list_ent
            
            print(text_df)
            

2) The question 'What type of event was this part of? Was it a Women-focused event, or one tied to a specific location?' was not answered because the scraped data did not contain this information. We would need to modify the scraping code to capture those fields.
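
If the scraping code were extended, a sketch along these lines could pull the event name from a talk page; the selector used here is purely illustrative, an assumption about the page layout rather than a verified one:

# fetch a talk page and try to extract the event name (sketch; selector is hypothetical)
            import requests
            from bs4 import BeautifulSoup
            
            def get_event_name(talk_url):
                html = requests.get(talk_url, timeout=10).text
                soup = BeautifulSoup(html, 'html.parser')
                # the attribute below is a placeholder and must be checked against the live page
                tag = soup.find(attrs={'data-testid': 'talk-event'})
                return tag.get_text(strip=True) if tag else None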

Areas of improvement:¶

1) We uncovered that when the scraped data was joined with the Kaggle data, some values were dropped. Further investigation should go into why those talks were not successfully scraped, so that more rows are retained in the dataset.
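
A sketch of how that investigation could start, assuming the scraped and Kaggle frames (called df_scraped and df_kaggle here, hypothetical names) share a title column:

# left-join with an indicator column to surface talks that failed to match (sketch)
            merged = df_kaggle.merge(df_scraped, on='title', how='left', indicator=True)
            missing = merged[merged['_merge'] == 'left_only']
            print(f"{len(missing)} Kaggle talks have no scraped counterpart")
            missing[['title']].head(10)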

2) Another area of improvement is how we define well-performing videos: in this analysis, a performing video is one whose likes reach the 75th percentile. That is of course an unfair measurement for recently released videos. Given that we do not have timestamps on the likes or views of each video, we should look for a better way to estimate or compare recently released content, so we can detect earlier when a new video might be performing well.
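
One possible age adjustment, assuming the date_released_year and date_released_month columns used above are also available on df, is to compare likes per day online instead of raw likes:

# likes per day since release as an age-adjusted performance measure (sketch)
            release_date = pd.to_datetime(dict(year=df.date_released_year,
                                               month=df.date_released_month,
                                               day=1))
            days_online = (pd.Timestamp.today() - release_date).dt.days.clip(lower=1)
            likes_per_day = df.likes / days_online
            # flag a video as performing if it beats the 75th percentile of likes_per_day
            target_age_adjusted = (likes_per_day >= likes_per_day.quantile(0.75)).astype(int)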

3) More information on the authors. Understanding the age, gender and nationality of authors could answer questions related to the diversity of the speakers. This data could be partially scraped from Wikipedia, as there is a dedicated page that tracks this information: https://en.wikipedia.org/wiki/List_of_TED_speakers
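
A hedged starting point for that scrape, relying only on pandas.read_html (the position of the speaker table on the Wikipedia page is an assumption and may need adjusting):

# pull the table of TED speakers from Wikipedia (sketch; table index may change)
            tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_TED_speakers')
            speakers = tables[0]   # assumes the first table on the page is the speaker list
            print(speakers.columns.tolist())
            speakers.head()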

4) Improvements to the recommendation engine. The current model's output does not perform well against a simple df.query using keywords from the title. The model is based on TF-IDF (term frequency and inverse document frequency) applied to the talk description; it could be improved by adding other available variables such as keywords, likes and author.
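
A minimal sketch of that idea is to concatenate the description with other text fields before vectorizing, so the TF-IDF space also reflects the title and author (a keywords column, if one is available, could be appended the same way):

# build a richer "document" per talk and re-fit the TF-IDF model (sketch)
            combined_text = (df.description_1.fillna('') + ' '
                             + df.title.fillna('') + ' '
                             + df.author.fillna(''))
            
            tfidf_plus = TfidfVectorizer(stop_words='english')
            tfidf_matrix_plus = tfidf_plus.fit_transform(combined_text)
            cosine_sim_plus = cosine_similarity(tfidf_matrix_plus, tfidf_matrix_plus)
            
            # reuse the existing helper with the richer similarity matrix
            get_recommendations(talk_liked, cosine_sim=cosine_sim_plus)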