In this project we will build a model to classify news as fake or real, using the dataset available on Kaggle. Link: https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset
The dataset can be used by anyone who wants to learn the basics of natural language processing and model building.
Note: In this notebook I have used seaborn and matplotlib as visualization libraries, but more interactive libraries such as Plotly can also be used for visualization.
import numpy as np
import pandas as pd
import re, string
import math
import seaborn as sns
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='retina'
from collections import Counter
from collections import OrderedDict
import nltk
nltk.download('stopwords')
#nltk.download()
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
from nltk.stem.snowball import SnowballStemmer
from sklearn.utils import shuffle
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout, Flatten
from keras.optimizers import Adam
from keras.callbacks import LearningRateScheduler, ReduceLROnPlateau
import warnings
warnings.simplefilter(action='ignore', category=Warning)
Our data comes in two separate files, one for fake news and one for real news. We will combine them into one dataframe for further analysis and add a target column labelling each article as fake or real.
# Change these paths to match your directory layout.
fake_news_data = pd.read_csv("../input/fake-and-real-news-dataset/Fake.csv")
real_news_data = pd.read_csv("../input/fake-and-real-news-dataset/True.csv")
Let's check that the columns of the two dataframes match, create the target column, and combine the dataframes.
real_news_data.columns == fake_news_data.columns
array([ True, True, True, True])
fake_news_data["target"] = "fake"
real_news_data["target"] = "real"
print("Our combined dataset should have %d rows." %(fake_news_data.shape[0] + real_news_data.shape[0]))
Our combined dataset should have 44898 rows.
news_df = pd.concat([real_news_data, fake_news_data]).reset_index(drop=True)  # DataFrame.append is deprecated in newer pandas
assert news_df.shape[0] == 44898
In the data analysis we will look at the following aspects of the data: column names/structure/types, missing data, and the distribution of column values. Since we have text data, we will also look at the distribution of words.
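For example, a quick missing-value check can be run up front (a one-liner sketch; df.info() below reports the same non-null counts):
# Count missing values per column; every column should report 0 here.
news_df.isnull().sum()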
news_df.sample(10)
 | title | text | subject | date | target
---|---|---|---|---|---
13799 | Vietnam court jails blogger for seven years fo... | (Reuters) - A court in Vietnam jailed a blogge... | worldnews | November 27, 2017 | real |
9745 | Republicans, Obama immigration chief clash ove... | WASHINGTON (Reuters) - The chief of U.S. Immig... | politicsNews | April 28, 2016 | real |
3239 | U.S. federal government can pay bills through ... | WASHINGTON (Reuters) - The U.S. federal govern... | politicsNews | June 12, 2017 | real |
29759 | Donald Trump Makes Pathetic, Racist Swipe At ... | President Obama s speech from a mosque in Balt... | News | February 4, 2016 | fake |
25048 | Trump SURRENDERS, Admits His Business Is A Co... | Donald Trump is finally dealing with the massi... | News | November 30, 2016 | fake |
364 | Kremlin says Putin not influenced by ex-Trump ... | MOSCOW (Reuters) - The Kremlin said on Monday ... | politicsNews | December 4, 2017 | real |
18805 | U.N. agrees new team of experts for Burundi bu... | GENEVA (Reuters) - The U.N. Human Rights Counc... | worldnews | September 28, 2017 | real |
25712 | Alec Baldwin NAILS Trump’s ‘Grab Them by the ... | Who didn t miss Saturday Night Live s take on ... | News | October 9, 2016 | fake |
9929 | John Kerry: Carnival should not bar Cuban-Amer... | MIAMI (Reuters) - U.S. Secretary of State John... | politicsNews | April 14, 2016 | real |
28620 | Trump’s Pick For Supreme Court Justice Shows ... | The GOP has been absolutely obsessed with bloc... | News | March 30, 2016 | fake |
news_df.columns
Index(['title', 'text', 'subject', 'date', 'target'], dtype='object')
news_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   target   44898 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB
We don't have any missing data in our dataset. All the columns are of type object, including the date column. We have the news title, the news text, the news subject, the date, and the target column indicating fake or real news.
news_df.describe()
 | title | text | subject | date | target
---|---|---|---|---|---
count | 44898 | 44898 | 44898 | 44898 | 44898 |
unique | 38729 | 38646 | 8 | 2397 | 2 |
top | Factbox: Trump fills top jobs for his administ... |  | politicsNews | December 20, 2017 | fake |
freq | 14 | 627 | 11272 | 182 | 23481 |
plt.figure(figsize=(8, 8))
sns.countplot(x="target", data=news_df)
plt.title("Total Counts of Fake or Real News")
plt.xlabel("Target Real/Fake News Column")
plt.ylabel("News Counts")
plt.show()
Our dataset is not highly imbalanced; the slight imbalance may even reflect the real-world distribution of news.
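To quantify this, we can check the class proportions directly (a one-line sketch; per the describe() output above, roughly 52% of the articles are fake):
# Share of each class; roughly 0.52 fake vs 0.48 real for this dataset.
news_df.target.value_counts(normalize=True)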
news_df.subject.unique()
array(['politicsNews', 'worldnews', 'News', 'politics', 'Government News', 'left-news', 'US_News', 'Middle-east'], dtype=object)
plt.figure(figsize=(15, 8))
sns.countplot(x="subject", data=news_df)
plt.title("Different types of news subjects.")
plt.xlabel("Subjects")
plt.ylabel("News Counts")
plt.show()
temp_df = news_df.groupby(['subject', 'target']).size().reset_index().pivot(columns='target', index='subject', values=0).fillna(0)
temp_df.plot(kind='bar', stacked=True, figsize=(12,6))
plt.title("Different types of news subjects.")
plt.xlabel("Subjects")
plt.ylabel("News Counts Fake/Real Distribution")
plt.show()
Fake and real news are not uniformly distributed across news subjects: some subjects contain only fake news while others contain only real news. We need to look into what kinds of words make up fake and real news. It also shows that we should be careful about using the news subject as a feature, since it could make our model biased.
news_df[news_df.date.str.len() > 19]
 | title | text | subject | date | target
---|---|---|---|---|---
30775 | https://100percentfedup.com/served-roy-moore-v... | https://100percentfedup.com/served-roy-moore-v... | politics | https://100percentfedup.com/served-roy-moore-v... | fake |
36924 | https://100percentfedup.com/video-hillary-aske... | https://100percentfedup.com/video-hillary-aske... | politics | https://100percentfedup.com/video-hillary-aske... | fake |
36925 | https://100percentfedup.com/12-yr-old-black-co... | https://100percentfedup.com/12-yr-old-black-co... | politics | https://100percentfedup.com/12-yr-old-black-co... | fake |
37256 | https://fedup.wpengine.com/wp-content/uploads/... | https://fedup.wpengine.com/wp-content/uploads/... | politics | https://fedup.wpengine.com/wp-content/uploads/... | fake |
37257 | https://fedup.wpengine.com/wp-content/uploads/... | https://fedup.wpengine.com/wp-content/uploads/... | politics | https://fedup.wpengine.com/wp-content/uploads/... | fake |
38849 | https://fedup.wpengine.com/wp-content/uploads/... | https://fedup.wpengine.com/wp-content/uploads/... | Government News | https://fedup.wpengine.com/wp-content/uploads/... | fake |
38850 | https://fedup.wpengine.com/wp-content/uploads/... | https://fedup.wpengine.com/wp-content/uploads/... | Government News | https://fedup.wpengine.com/wp-content/uploads/... | fake |
40350 | Homepage | [vc_row][vc_column width= 1/1 ][td_block_trend... | left-news | MSNBC HOST Rudely Assumes Steel Worker Would N... | fake |
43286 | https://fedup.wpengine.com/wp-content/uploads/... | https://fedup.wpengine.com/wp-content/uploads/... | left-news | https://fedup.wpengine.com/wp-content/uploads/... | fake |
43287 | https://fedup.wpengine.com/wp-content/uploads/... | https://fedup.wpengine.com/wp-content/uploads/... | left-news | https://fedup.wpengine.com/wp-content/uploads/... | fake |
Just by looking at the date column, we can see that it has irregularities: some rows contain URLs or article text instead of dates. We will drop these rows and convert the column to datetime.
news_df.drop(news_df[news_df.date.str.len() > 19].index, inplace = True)
news_df['date'] = pd.to_datetime(news_df['date'])
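As a quick sanity check after the conversion (a small sketch; output not shown), we can confirm the dtype and inspect the date range covered:
# The column should now be datetime64; min/max give the collection window.
print(news_df['date'].dtype)
print(news_df['date'].min(), news_df['date'].max())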
Now, let's look at the timeline of increases and decreases in real and fake news in our dataset.
daily_aggregated_counts = news_df.groupby(["date", 'target']).size().reset_index().sort_values(["date"])
daily_aggregated_counts.columns = ["date", "target", "count"]
daily_aggregated_counts.head(20)
 | date | target | count
---|---|---|---
0 | 2015-03-31 | fake | 8 |
1 | 2015-04-01 | fake | 2 |
2 | 2015-04-02 | fake | 1 |
3 | 2015-04-04 | fake | 5 |
4 | 2015-04-05 | fake | 13 |
5 | 2015-04-06 | fake | 24 |
6 | 2015-04-07 | fake | 16 |
7 | 2015-04-08 | fake | 10 |
8 | 2015-04-09 | fake | 22 |
9 | 2015-04-10 | fake | 11 |
10 | 2015-04-11 | fake | 6 |
11 | 2015-04-12 | fake | 14 |
12 | 2015-04-13 | fake | 16 |
13 | 2015-04-14 | fake | 25 |
14 | 2015-04-15 | fake | 6 |
15 | 2015-04-16 | fake | 12 |
16 | 2015-04-17 | fake | 12 |
17 | 2015-04-18 | fake | 2 |
18 | 2015-04-19 | fake | 6 |
19 | 2015-04-20 | fake | 10 |
Looking at the aggregated data, we observe that there are no counts for real news in 2015.
Let's look at time-series plots of the daily aggregated counts for fake and real news. We will remove the data for 2015 and 2018, since those years contain only fake news, and use the rest of the data for the analysis.
_temp_daily_counts = daily_aggregated_counts[(daily_aggregated_counts.date.dt.year != 2018) &
(daily_aggregated_counts.date.dt.year != 2015)]
plt.figure(figsize=(15, 8))
sns.lineplot(x="date", y="count", data=_temp_daily_counts, hue="target")
plt.title("Daily time series analysis of fake/real news.")
plt.xlabel("Date")
plt.ylabel("Daily News Counts (Fake/Real)")
plt.show()
Clearly, at the beginning of 2016 we had more fake news than real news, while toward the end of 2017 there was a surge in real news. This raises questions about how the data was collected, and about what factors drove the decrease and increase in fake news over the years.
Let's aggregate the fake/real news counts by month and look at the timeline.
monthly_aggregated_counts = daily_aggregated_counts.groupby([daily_aggregated_counts.date.dt.to_period('M') , 'target']).sum().reset_index().sort_values(["date"])
monthly_aggregated_counts.columns = ["date", "target", "count"]
_temp_monthly_counts = monthly_aggregated_counts[(monthly_aggregated_counts.date.dt.year != 2018) &
                                                 (monthly_aggregated_counts.date.dt.year != 2015)].copy()
_temp_monthly_counts.loc[:, 'date'] = _temp_monthly_counts.date.values.astype('datetime64[M]')
plt.figure(figsize=(15, 8))
sns.lineplot(x="date", y="count", data=_temp_monthly_counts, hue="target")
plt.title("Monthly time series analysis of fake/real news.")
plt.xlabel("Month/Year")
plt.ylabel("Monthly News Counts (Fake/Real)")
plt.show()
We see a rise in real news articles around August 2017. I went online to research what was happening in the US at that time and found a rise in fake-news discussion at various levels. From the beginning of 2017, discussions on how to combat fake news were under way, and Twitter started working on initiatives to stop fake and offensive news on their platform.
Let's also check how the fake/real news time series break down by news subject.
_monthly_subject_counts = news_df.groupby([news_df.date.dt.to_period('M'), 'subject', 'target']).size().reset_index().sort_values(["date"])
_monthly_subject_counts.columns = ["date", "subject", "target", "count"]
_monthly_subject_counts.loc[:, 'date'] = _monthly_subject_counts.date.values.astype('datetime64[M]')
plt.figure(figsize=(15, 8))
sns.lineplot(x="date", y="count", data=_monthly_subject_counts, hue="subject")
plt.title("Monthly time series analysis of (fake & real) news based on subjects.")
plt.xlabel("Year-Month")
plt.ylabel("Monthly News Counts Based on Subjects")
plt.show()
_fake_subject = _monthly_subject_counts[_monthly_subject_counts.target=="fake"]
plt.figure(figsize=(15, 8))
sns.lineplot(x="date", y="count", data=_fake_subject, hue="subject")
plt.title("Monthly time series analysis of FAKE news based on subjects.")
plt.xlabel("Year-Month")
plt.ylabel("Monthly Fake News Counts Based on Subjects")
plt.show()
_real_subject = _monthly_subject_counts[_monthly_subject_counts.target=="real"]
plt.figure(figsize=(15, 8))
sns.lineplot(x="date", y="count", data=_real_subject, hue="subject")
plt.title("Monthly time series analysis of REAL news based on subjects.")
plt.xlabel("Year-Month")
plt.ylabel("Monthly Real News Counts Based on Subjects")
plt.show()
In the above time series analysis we have seen that some subjects don't exist in real news while others don't exist in fake news. This again raises the question of how the data was collected; a model built on this data might not generalize well to real-world data. Let's look at the text that makes up the news articles, which might help us in building features for a model.
Now, let's look at the news title and news text columns. To analyze the text columns, we will need a few helper functions to clean the data and to create word-count columns in our dataframe.
news_df[news_df.target == "real"].text[1]
'WASHINGTON (Reuters) - Transgender people will be allowed for the first time to enlist in the U.S. military starting on Monday as ordered by federal courts, the Pentagon said on Friday, after President Donald Trump’s administration decided not to appeal rulings that blocked his transgender ban. Two federal appeals courts, one in Washington and one in Virginia, last week rejected the administration’s request to put on hold orders by lower court judges requiring the military to begin accepting transgender recruits on Jan. 1. A Justice Department official said the administration will not challenge those rulings. “The Department of Defense has announced that it will be releasing an independent study of these issues in the coming weeks. So rather than litigate this interim appeal before that occurs, the administration has decided to wait for DOD’s study and will continue to defend the president’s lawful authority in District Court in the meantime,” the official said, speaking on condition of anonymity. In September, the Pentagon said it had created a panel of senior officials to study how to implement a directive by Trump to prohibit transgender individuals from serving. The Defense Department has until Feb. 21 to submit a plan to Trump. Lawyers representing currently-serving transgender service members and aspiring recruits said they had expected the administration to appeal the rulings to the conservative-majority Supreme Court, but were hoping that would not happen. Pentagon spokeswoman Heather Babb said in a statement: “As mandated by court order, the Department of Defense is prepared to begin accessing transgender applicants for military service Jan. 1. All applicants must meet all accession standards.” Jennifer Levi, a lawyer with gay, lesbian and transgender advocacy group GLAD, called the decision not to appeal “great news.” “I’m hoping it means the government has come to see that there is no way to justify a ban and that it’s not good for the military or our country,” Levi said. Both GLAD and the American Civil Liberties Union represent plaintiffs in the lawsuits filed against the administration. In a move that appealed to his hard-line conservative supporters, Trump announced in July that he would prohibit transgender people from serving in the military, reversing Democratic President Barack Obama’s policy of accepting them. Trump said on Twitter at the time that the military “cannot be burdened with the tremendous medical costs and disruption that transgender in the military would entail.” Four federal judges - in Baltimore, Washington, D.C., Seattle and Riverside, California - have issued rulings blocking Trump’s ban while legal challenges to the Republican president’s policy proceed. The judges said the ban would likely violate the right under the U.S. Constitution to equal protection under the law. The Pentagon on Dec. 8 issued guidelines to recruitment personnel in order to enlist transgender applicants by Jan. 1. The memo outlined medical requirements and specified how the applicants’ sex would be identified and even which undergarments they would wear. The Trump administration previously said in legal papers that the armed forces were not prepared to train thousands of personnel on the medical standards needed to process transgender applicants and might have to accept “some individuals who are not medically fit for service.” The Obama administration had set a deadline of July 1, 2017, to begin accepting transgender recruits. 
But Trump’s defense secretary, James Mattis, postponed that date to Jan. 1, 2018, which the president’s ban then put off indefinitely. Trump has taken other steps aimed at rolling back transgender rights. In October, his administration said a federal law banning gender-based workplace discrimination does not protect transgender employees, reversing another Obama-era position. In February, Trump rescinded guidance issued by the Obama administration saying that public schools should allow transgender students to use the restroom that corresponds to their gender identity. '
news_df[news_df.target == "fake"].text[21417]
'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and the very dishonest fake news media. The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year, President Angry Pants tweeted. 2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America! Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t even allow him to rise above the gutter long enough to wish the American citizens a happy new year! Bishop Talbert Swan (@TalbertSwan) December 31, 2017no one likes you Calvin (@calvinstowell) December 31, 2017Your impeachment would make 2018 a great year for America, but I ll also accept regaining control of Congress. Miranda Yaver (@mirandayaver) December 31, 2017Do you hear yourself talk? When you have to include that many people that hate you you have to wonder? Why do the they all hate me? Alan Sandoval (@AlanSandoval13) December 31, 2017Who uses the word Haters in a New Years wish?? Marlene (@marlene399) December 31, 2017You can t just say happy new year? Koren pollitt (@Korencarpenter) December 31, 2017Here s Trump s New Year s Eve tweet from 2016.Happy New Year to all, including to my many enemies and those who have fought me and lost so badly they just don t know what to do. Love! Donald J. Trump (@realDonaldTrump) December 31, 2016This is nothing new for Trump. He s been doing this for years.Trump has directed messages to his enemies and haters for New Year s, Easter, Thanksgiving, and the anniversary of 9/11. pic.twitter.com/4FPAe2KypA Daniel Dale (@ddale8) December 31, 2017Trump s holiday tweets are clearly not presidential.How long did he work at Hallmark before becoming President? Steven Goodine (@SGoodine) December 31, 2017He s always been like this . . . the only difference is that in the last few years, his filter has been breaking down. Roy Schulze (@thbthttt) December 31, 2017Who, apart from a teenager uses the term haters? Wendy (@WendyWhistles) December 31, 2017he s a fucking 5 year old Who Knows (@rainyday80) December 31, 2017So, to all the people who voted for this a hole thinking he would change once he got into power, you were wrong! 70-year-old men don t change and now he s a year older.Photo by Andrew Burton/Getty Images.'
Just by looking at the text we can see that fake news contains a lot of @twitter_handle mentions, so this is something to examine further.
def countTwitterUsers(row):
    # Count the Twitter @handles mentioned in a piece of text.
    _user_handles_regex = re.compile(r'@([A-Za-z0-9_]+)')
    return len(re.findall(_user_handles_regex, row))
news_df.loc[:, "total_twitter_handles"] = news_df.apply(lambda x: countTwitterUsers(x["text"]), axis=1)
plt.figure(figsize=(15, 8))
ax = sns.boxplot(x="target", y="total_twitter_handles",
data=news_df, palette="Set3"
)
plt.show()
print("Statistics on number of twitter handles in fake news")
news_df[news_df.target == "fake"].total_twitter_handles.describe()
Statistics on number of twitter handles in fake news
count    23471.000000
mean         1.188104
std          4.145195
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max         88.000000
Name: total_twitter_handles, dtype: float64
print("Statistics on number of twitter handles in real news")
news_df[news_df.target == "real"].total_twitter_handles.describe()
Statistics on number of twitter handles in real news
count    21417.000000
mean         0.037494
std          0.423052
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         19.000000
Name: total_twitter_handles, dtype: float64
Looking at the boxplot for real news, we can say that most real news does not contain any Twitter handles. Fake news is more spread out than real news in terms of Twitter handles used, and both classes have many outliers. The number of Twitter handles mentioned in fake news is much higher than in real news. This might reflect the sources of data collection: it looks like the fake news was collected from Twitter-heavy sources while the real news came from other news outlets.
Let's look at some other statistics, like the number of words and the number of unique words in the fake and real news data.
def countUniqueWords(row, col_name):
    # Number of distinct whitespace-separated tokens.
    return len(set(row[col_name].strip(' ').split()))

def countWords(row, col_name):
    # Total number of whitespace-separated tokens.
    return len(row[col_name].strip(' ').split())
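As a quick illustration of these helpers (a plain dict stands in for a DataFrame row here):
countWords({"text": "fake news fake headlines"}, "text")        # 4
countUniqueWords({"text": "fake news fake headlines"}, "text")  # 3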
Now, let's move on to cleaning the text data and then look at word statistics for fake and real news.
stop_words = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop_words.update(punctuation)
def cleanData(row, column_name):
    text = row[column_name]
    # Strip any HTML markup.
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove bracketed fragments and URLs.
    text = re.sub(r'\[[^]]*\]', '', text)
    text = re.sub(r'http\S+', '', text)
    # Keep lowercase alphabetic tokens that are not stopwords.
    text = [word.strip().lower() for word in text.split()
            if word.strip().lower() not in stop_words and word.strip().lower().isalpha()]
    text = " ".join(text)
    text = re.sub(r'[?$.!,()|]', r'', text)
    return text
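A quick sanity check on a made-up snippet (again using a plain dict in place of a row) shows the HTML tags, URL, non-alphabetic tokens, and stopwords being stripped:
cleanData({"text": "<p>Trump says: visit https://example.com now</p>"}, "text")
# -> 'trump visit'  ("says:" fails isalpha(), "now" is a stopword, the URL is removed)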
Let's lemmatize our data to reduce inflection and map related word forms to a common lemma, which helps our dataset generalize.
wordnet_lemmatizer = WordNetLemmatizer()  # instantiate once instead of once per row

def lemmatizeNewsText(row, column):
    # Lemmatize each token and rejoin the tokens into a single string.
    token_words = word_tokenize(row[column])
    lemmatize_text = " ".join([wordnet_lemmatizer.lemmatize(word) for word in token_words])
    return lemmatize_text
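Note that WordNetLemmatizer and word_tokenize rely on NLTK resources that may not be installed in every environment; if you run this outside Kaggle, download them first:
# Required by WordNetLemmatizer and word_tokenize, respectively.
nltk.download('wordnet')
nltk.download('punkt')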
news_df.loc[:, "clean_data"] = news_df.apply(lambda x: cleanData(x, "text"), axis=1)
news_df.loc[:, "lemmatize_data"] = news_df.apply(lambda x: lemmatizeNewsText(x, "clean_data"), axis=1)
news_df.loc[:, "unique_words"] = news_df.apply(lambda x: countUniqueWords(x, "lemmatize_data"), axis=1)
news_df.loc[:, "total_words"] = news_df.apply(lambda x: countWords(x, "lemmatize_data"), axis=1)
fig, axs = plt.subplots(ncols=2, figsize=(15,8))
ax1 = sns.violinplot(x="target", y="unique_words",
data=news_df, palette="Set3",
ax=axs[0])
ax2 = sns.violinplot(x="target", y="total_words",
data=news_df, palette="Set3",
ax=axs[1])
ax1.title.set_text('Unique Words Violin Plots for Fake and Real News Text')
ax2.title.set_text('Total Words Violin Plots for Fake and Real News Text')
plt.show()
print("Statistics on number of total words used in fake news")
news_df[news_df.target == "fake"].total_words.describe()
Statistics on number of total words used in fake news
count    23471.000000
mean       186.917345
std        186.872751
min          0.000000
25%        107.000000
50%        159.000000
75%        221.000000
max       3993.000000
Name: total_words, dtype: float64
print("Statistics on number of total words used in real news")
news_df[news_df.target == "real"].total_words.describe()
Statistics on number of total words used in real news
count    21417.000000
mean       182.789887
std        127.146998
min          0.000000
25%         71.000000
50%        170.000000
75%        248.000000
max       1806.000000
Name: total_words, dtype: float64
Analyzing the violin plots for unique word counts and total word counts in our text column:
- The maximum number of total words in fake news is much higher than in real news.
- The medians for both kinds of news are quite close.
- There are many more outliers (total words in text) in fake news than in real news.
- Fake news outliers are at a much more extreme end compared to real news.
- Some news texts have 0 words, which shows that we will need to clean some of this data.
- Fake news is much more spread out compared to real news.
IMP: While analyzing datasets, ask these kinds of questions about the data so they can be communicated to the customer before building the model.
The violin plots raise the question of why fake news has more words than real news. Is this a correct representation of real-world data? Since this data was downloaded from Kaggle, that can be a hard question to answer: most of the time such data is collected by individuals, and these questions go unanswered.
We also need to remove the rows whose news text contains 0 words before building a model, so let's drop them now.
news_df = news_df.drop(news_df[news_df.total_words == 0].index.union(news_df[news_df.unique_words == 0].index)).reset_index(drop=True)
Let's analyze the words that make up fake and real news.
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 ,
stopwords = STOPWORDS).generate(" ".join(news_df[news_df.target == "real"].lemmatize_data))
plt.imshow(wc , interpolation = 'bilinear')
plt.title("Real News Word Cloud")
plt.show()
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 ,
stopwords = STOPWORDS).generate(" ".join(news_df[news_df.target == "fake"].lemmatize_data))
plt.imshow(wc , interpolation = 'bilinear')
plt.title("Fake News Word Cloud")
plt.show()
Looking at the word clouds for fake and real news, the words seem quite similar between the two classes.
import itertools
def getWordCorpus(df, column):
words = list(itertools.chain.from_iterable([sentence.split() for sentence in df[column]]))
return words
real_corpus = getWordCorpus(news_df[news_df.target == "real"], "lemmatize_data")
fake_corpus = getWordCorpus(news_df[news_df.target == "fake"], "lemmatize_data")
len(real_corpus)
len(fake_corpus)
4387137
real_counter = Counter(real_corpus)
real_most_common = real_counter.most_common(20)
real_most_common = dict(real_most_common)
plt.figure(figsize = (20,8))
sns.barplot(x=list(real_most_common.keys()), y=list(real_most_common.values()))
plt.title("Real News Most Common Words")
plt.show()
fake_counter = Counter(fake_corpus)
fake_most_common = fake_counter.most_common(20)
fake_most_common = dict(fake_most_common)
plt.figure(figsize = (20,8))
sns.barplot(x=list(fake_most_common.keys()), y=list(fake_most_common.values()))
plt.title("Fake News Most Common Words")
plt.show()
The most common words in fake and real news are quite similar to each other. This looks like what would happen in the real world, where certain words appear in fake news (donald, trump, clinton, obama, etc.) and the same words appear in real news (donald, trump, republican, official, etc.) that provides clarity about the fake news being spread.
Let's look at the chi-squared scores for the words in the news data to find how strongly each word is associated with the fake/real target.
news_df["target"] = news_df["target"].astype('category')
news_df["target_category"] = news_df["target"].cat.codes
vectorizer = CountVectorizer()
_vec = vectorizer.fit_transform(news_df['lemmatize_data'])
_chi2_scores = chi2(_vec, news_df.target_category)[0]
chi2_scores_features = dict(zip(vectorizer.get_feature_names(), _chi2_scores))
sorted_chi2_scores = {k: v for k, v in sorted(chi2_scores_features.items(), key=lambda item: item[1], reverse=True)}
plt.figure(figsize = (20,8))
sns.barplot(x=list(sorted_chi2_scores.keys())[:20], y=list(sorted_chi2_scores.values())[:20])
plt.title("Real News Chi-Squared Test Scores")
plt.show()
Clearly, we can see that words like "said", "black", "obama", and "clinton" have high prominence. But we should also look into topic modelling to understand the presence of these words in the news.
def topics(model, feature_names, no_top_words):
_dict = {}
for topic_idx, topic in enumerate(model.components_):
_dict[topic_idx] = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
return _dict
lda = LatentDirichletAllocation(random_state=42).fit(_vec)
topic_all = topics(lda, vectorizer.get_feature_names(), 15)
vectorizer_fake = CountVectorizer()
vectorizer_true = CountVectorizer()
_fake_df = vectorizer_fake.fit_transform(news_df[news_df.target == "fake"].lemmatize_data)
_real_df = vectorizer_true.fit_transform(news_df[news_df.target == "real"].lemmatize_data)
lda_fake = LatentDirichletAllocation(random_state=42, n_components=5).fit(_fake_df)
lda_real = LatentDirichletAllocation(random_state=42, n_components=5).fit(_real_df)
topic_real = topics(lda_real, vectorizer_true.get_feature_names(), 15)
topic_fake = topics(lda_fake, vectorizer_fake.get_feature_names(), 15)
def plot_clouds(_dict, title):
for topic, words in zip(list(_dict.keys())[:4], list(_dict.values())[:4]):
cloud = " ".join(words)
wordcloud = WordCloud(width = 800, height = 800,
background_color ='white',
min_font_size = 10).generate(cloud)
plt.figure(figsize = (4, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.title(title + ' Topics '+ str(topic+1))
plt.show()
plot_clouds(topic_fake, 'Fake news Topics')
plot_clouds(topic_real, 'Real news Topics')
Looking at the topics, we observe that the highly prominent words appear in the topics as well.
Now, let's look at n-grams to check which word sequences are most common in fake and real news.
def generateNgramsText(text, n):
text = text.lower()
text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
tokens = [token for token in text.split(" ") if token != ""]
ngrams = zip(*[tokens[i:] for i in range(n)])
return [" ".join(ngram) for ngram in ngrams]
def generateTopNgrams(corpus, column_name, n, top_n):
all_n_grams = []
for text in corpus[column_name]:
all_n_grams.extend(generateNgramsText(text, n))
sort_orders = sorted(OrderedDict(Counter(all_n_grams)).items(), key=lambda x: x[1], reverse=True)
result_dict = {item[0]: item[1] for item in sort_orders[: top_n]}
return result_dict
top_10_unigram = generateTopNgrams(news_df[news_df.target == "fake"], "lemmatize_data", 1, 10)
plt.figure(figsize = (20,8))
sns.barplot(x= list(top_10_unigram.values()), y=list(top_10_unigram.keys()))
plt.title("Top 10 Fake News Unigram Word Analysis")
plt.show()
top_10_bigrams = generateTopNgrams(news_df[news_df.target == "fake"], "lemmatize_data", 2, 10)
plt.figure(figsize = (20,8))
sns.barplot(x=list(top_10_bigrams.values()), y= list(top_10_bigrams.keys()))
plt.title("Top 10 Bigrams Fake Analysis")
plt.show()
top_10_trigrams = generateTopNgrams(news_df[news_df.target == "fake"], "lemmatize_data", 3, 10)
plt.figure(figsize = (25,8))
sns.barplot(x=list(top_10_trigrams.values()), y=list(top_10_trigrams.keys()))
plt.title("Top 10 Trigrams Fake Analysis")
plt.show()
top_10_unigram = generateTopNgrams(news_df[news_df.target == "real"], "lemmatize_data", 1, 10)
plt.figure(figsize = (20,8))
sns.barplot(x= list(top_10_unigram.values()), y=list(top_10_unigram.keys()))
plt.title("Top 10 Real Unigram Word Analysis")
plt.show()
top_10_bigrams = generateTopNgrams(news_df[news_df.target == "real"], "lemmatize_data", 2, 10)
plt.figure(figsize = (20,8))
sns.barplot(x=list(top_10_bigrams.values()), y= list(top_10_bigrams.keys()))
plt.title("Top 10 Real Bigrams Analysis")
plt.show()
top_10_trigrams = generateTopNgrams(news_df[news_df.target == "real"], "lemmatize_data", 3, 10)
plt.figure(figsize = (25,8))
sns.barplot(x=list(top_10_trigrams.values()), y=list(top_10_trigrams.keys()))
plt.title("Top 10 Real Trigrams Analysis")
plt.show()
Analyzing the n-grams for fake and real news, both sets of news apparently share the same kinds of n-grams.
Conclusion of the data analysis: before we start feature engineering and model building, we need to keep in mind the results of our data analysis. We observed that real and fake news share many keywords, the most common n-grams are quite similar, and the chi-squared analysis found that the most prominent words are common to both kinds of news. This is a clear indication that our dataset might be biased: the sources of the two datasets are distinct, yet they have a lot in common. Let's first see how the most basic machine learning models work on our dataset, then move on to more complicated models to achieve better accuracy.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
X = news_df.lemmatize_data
y = news_df.target_category
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=7)
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
For the first algorithm, let's use the classifier from the family of passive-aggressive algorithms. You can find a description here: https://scikit-learn.org/stable/modules/linear_model.html#passive-aggressive
def passiveAggressive(X_train, y_train):
    # Fit a Passive-Aggressive classifier on the TF-IDF features.
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(X_train, y_train)
    return pac

pac = passiveAggressive(tfidf_train, y_train)
Let's build a logistic regression model and also check how many K-best features we need to build a model with reasonable accuracy.
def selectKLogisticRegression(X, y):
    # For k = 10%, 20%, ..., 100% of the TF-IDF vocabulary, keep the k best
    # features by chi-squared score and record logistic regression accuracy.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    X_tfidf = tfidf_vectorizer.fit_transform(X)
    accuracy_scores_k = {}
    for i in range(1, 11):
        select_percent_features = X_tfidf.shape[1] * 0.1 * i
        sel_chi2 = SelectKBest(chi2, k=int(select_percent_features))
        X_chi2 = sel_chi2.fit_transform(X_tfidf, y)
        X_train, X_test, y_train, y_test = train_test_split(X_chi2, y,
                                                            test_size=0.2, random_state=7)
        clf = LogisticRegression()
        model = clf.fit(X_train, y_train)
        predict = model.predict(X_test)
        score = accuracy_score(y_test, predict)
        accuracy_scores_k[str(i)] = score
    return accuracy_scores_k
Let's build a deep learning model to check if we can achieve higher accuracy. We will use GloVe (Global Vectors) embeddings, pre-trained by Stanford, to represent the text features for the model.
I found this link helpful for learning about and implementing GloVe embeddings: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
We will load the GloVe vectors into a dictionary.
embeddings_index = {}
with open("../input/gloveicg/Glove/glove.6B.300d.txt", 'r') as f:
for line in f:
values = line.split()
word = values[0]
vector = np.asarray(values[1:], "float32")
embeddings_index[word] = vector
print('Loaded %s word vectors.' % len(embeddings_index))
Loaded 400000 word vectors.
We will use only 1,000 features (max_features below), and the maximum sequence length will be 300. These are hyperparameters that need to be tuned for real-world training and deployment, for example with a grid search, as sketched below.
max_features = 1000
max_length = 300
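As a rough sketch of what such a grid search could look like (the candidate values below are illustrative, not tuned; a full sweep would rebuild the tokenizer and retrain the model for each combination):
from sklearn.model_selection import ParameterGrid

# Hypothetical search space for the two hyperparameters above.
param_grid = {"max_features": [1000, 5000, 10000], "max_length": [100, 300, 500]}
for params in ParameterGrid(param_grid):
    # In a full sweep: tokenize with params["max_features"], pad to
    # params["max_length"], train the model, and record validation accuracy.
    print(params)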
Tokenize the training set.
t = Tokenizer(num_words=max_features)
t.fit_on_texts(X_train)
We will encode the training dataset and use padding (because our news text might be of varying length) for the sequences.
vocab_size = len(t.word_index) + 1
train_encoded_docs = t.texts_to_sequences(X_train)
train_padded_docs = pad_sequences(train_encoded_docs, maxlen=max_length, padding='post')
test_encoded_docs = t.texts_to_sequences(X_test)
test_padded_docs = pad_sequences(test_encoded_docs, maxlen=max_length, padding='post')
vocab_size
69593
We will create a word-embedding matrix for the words that appear in our training data. Using the tokenizer fit on the training data, we look up each word and assign it the corresponding weight vector from the pre-trained GloVe embeddings we loaded.
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
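A check worth running here (a small sketch; output not shown) is how much of our vocabulary is actually covered by the pre-trained vectors, since out-of-vocabulary words keep all-zero rows in the matrix:
# Fraction of the tokenizer vocabulary that has a pre-trained GloVe vector.
covered = sum(1 for word in t.word_index if word in embeddings_index)
print("GloVe coverage: %.2f%%" % (100 * covered / len(t.word_index)))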
We will create our embedding layer using the weights from the GloVe word embeddings. Since we do not want these weights to change while training our model, we set trainable to False.
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False)
Create and Train Model
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau
model = Sequential()
model.add(e)
model.add(LSTM(units=128 , return_sequences = True, recurrent_dropout = 0.2 , dropout = 0.2))
model.add(LSTM(units=64 , recurrent_dropout = 0.1 , dropout = 0.1))
model.add(Dense(units = 32 , activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(lr = 0.01), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
Model: "sequential_3" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_3 (Embedding) (None, 300, 300) 20877900 _________________________________________________________________ lstm_6 (LSTM) (None, 300, 128) 219648 _________________________________________________________________ lstm_7 (LSTM) (None, 64) 49408 _________________________________________________________________ dense_6 (Dense) (None, 32) 2080 _________________________________________________________________ dense_7 (Dense) (None, 1) 33 ================================================================= Total params: 21,149,069 Trainable params: 271,169 Non-trainable params: 20,877,900 _________________________________________________________________
We can use different strategies to schedule the learning rate; here we use ReduceLROnPlateau, and a step-decay scheduler is sketched in the commented-out code below.
"""def step_decay(epoch):
initial_lrate = 0.1
drop = 0.5
epochs_drop = 5.0
lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop))
return lrate
callback_list = [LearningRateScheduler(step_decay)]"""
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', patience = 2, verbose=1, factor=0.5, min_lr=0.00001)
callback_list = [learning_rate_reduction]
Training Deep Learning Model
history = model.fit(train_padded_docs, y_train, batch_size = 256, validation_split=0.25,
epochs = 10, callbacks=callback_list, shuffle=True)
Epoch 1/10
104/104 [==============================] - 243s 2s/step - loss: 0.6057 - accuracy: 0.6504 - val_loss: 0.6932 - val_accuracy: 0.5140
Epoch 2/10
104/104 [==============================] - 240s 2s/step - loss: 0.6923 - accuracy: 0.5219 - val_loss: 0.6884 - val_accuracy: 0.5690
Epoch 3/10
104/104 [==============================] - 240s 2s/step - loss: 0.6391 - accuracy: 0.5971 - val_loss: 0.2519 - val_accuracy: 0.9110
Epoch 4/10
104/104 [==============================] - 240s 2s/step - loss: 0.1900 - accuracy: 0.9351 - val_loss: 0.1261 - val_accuracy: 0.9554
Epoch 5/10
104/104 [==============================] - 243s 2s/step - loss: 0.1263 - accuracy: 0.9569 - val_loss: 0.1212 - val_accuracy: 0.9586
Epoch 6/10
104/104 [==============================] - 246s 2s/step - loss: 0.1002 - accuracy: 0.9663 - val_loss: 0.0846 - val_accuracy: 0.9709
Epoch 7/10
104/104 [==============================] - 246s 2s/step - loss: 0.0958 - accuracy: 0.9685 - val_loss: 0.0957 - val_accuracy: 0.9709
Epoch 8/10
104/104 [==============================] - 251s 2s/step - loss: 0.0774 - accuracy: 0.9743 - val_loss: 0.0848 - val_accuracy: 0.9723
Epoch 9/10
104/104 [==============================] - 265s 3s/step - loss: 0.0622 - accuracy: 0.9787 - val_loss: 0.0842 - val_accuracy: 0.9706
Epoch 10/10
104/104 [==============================] - 265s 3s/step - loss: 0.0635 - accuracy: 0.9786 - val_loss: 0.0807 - val_accuracy: 0.9769
Let's check the accuracy of the passive-aggressive classifier.
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')
This is really good accuracy, but we need to check what is causing it.
k_scores = selectKLogisticRegression(X, y)
plt.figure(figsize = (25,10))
sns.lineplot(x=list(k_scores.keys()), y=list(k_scores.values()))
plt.show()
Even with 10% of the features we are able to achieve good accuracy. Let's check if we can use even fewer features and still predict with a similar level of accuracy.
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
X_tfidf = tfidf_vectorizer.fit_transform(X)
sel_chi2 = SelectKBest(chi2, k=500)
X_chi2 = sel_chi2.fit_transform(X_tfidf, y)
X_train, X_test, y_train, y_test = train_test_split(X_chi2, y,
test_size=0.2, random_state=7)
clf = LogisticRegression()
model = clf.fit(X_train, y_train)
predict = model.predict(X_test)
score = accuracy_score(y_test, predict)
score
confusion_matrix(y_test,predict)
print("We can predict with accuracy %s using only %s features. We are using only %s percent of features. "
%(score*100, max_features, round(max_features/(len(real_corpus) + len(fake_corpus))*100, 4)))
Using only 500 words, we can achieve really high accuracy. This shows that our dataset has a strong inherent bias.
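One way to dig into that bias (a sketch reusing the objects above) is to inspect which words the chi-squared selector kept:
# Peek at some of the 500 words driving the separation between the classes.
selected_words = np.array(tfidf_vectorizer.get_feature_names())[sel_chi2.get_support()]
print(selected_words[:20])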
model.evaluate(test_padded_docs, y_test)
277/277 [==============================] - 44s 158ms/step - loss: 0.0710 - accuracy: 0.9780
[0.0710366815328598, 0.9780368804931641]
We also achieved really good accuracy with the deep learning model.
If you have any feedback, contact me via my LinkedIn profile.