{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [ "remove-cell" ] }, "outputs": [], "source": [ "import sys\n", "import os\n", "if not any(path.endswith('textbook') for path in sys.path):\n", " sys.path.append(os.path.abspath('../../..'))\n", "from textbook_utils import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "tags": [ "remove-cell" ] }, "outputs": [], "source": [ "df = pd.read_csv('data/fake_news.csv', parse_dates=['timestamp'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "(sec:fake_news_exploring)=\n", "# Exploring the Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dataset of news articles we're exploring is just one part of the larger FakeNewsNet dataset. As such, the original paper doesn't provide detailed information about our subset of data.\n", "So, to better understand the data, we must explore it ourselves.\n", "\n", "Before starting exploratory data analysis, we apply our standard practice of splitting the data into training and test sets. We perform EDA using only the train set:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# Encode the label as 1 for fake articles and 0 otherwise. Matching both\n", "# 'fake' and 1 keeps this cell idempotent: re-running it leaves an\n", "# already-encoded column unchanged instead of resetting every label to 0.\n", "df['label'] = df['label'].isin(['fake', 1]).astype(int)\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " df[['timestamp', 'baseurl', 'content']], df['label'],\n", " test_size=0.25, random_state=42,\n", ")" ] }, { "cell_type": "code", "execution_count": 324, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | timestamp | \n", "baseurl | \n", "content | \n", "
---|---|---|---|
164 | \n", "2019-01-04 19:25:46 | \n", "worldnewsdailyreport.com | \n", "Chinese lunar rover finds no evidence of Ameri... | \n", "
28 | \n", "2016-01-12 21:02:28 | \n", "occupydemocrats.com | \n", "Virginia Republican Wants Schools To Check Chi... | \n", "
\n", " | trump | \n", "clinton | \n", "state | \n", "vote | \n", "... | \n", "swamp | \n", "cnn | \n", "the | \n", "label | \n", "
---|---|---|---|---|---|---|---|---|---|
164 | \n", "False | \n", "False | \n", "True | \n", "False | \n", "... | \n", "False | \n", "False | \n", "True | \n", "1 | \n", "
28 | \n", "False | \n", "False | \n", "False | \n", "False | \n", "... | \n", "False | \n", "False | \n", "True | \n", "1 | \n", "
708 | \n", "False | \n", "False | \n", "True | \n", "True | \n", "... | \n", "False | \n", "False | \n", "True | \n", "0 | \n", "
193 | \n", "False | \n", "False | \n", "False | \n", "False | \n", "... | \n", "False | \n", "False | \n", "True | \n", "1 | \n", "
4 rows × 16 columns
\n", "