{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6. Análisis de sentimiento"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "El análisis de sentimiento es una técnica de procesamiento de lenguaje natural (NLP) que permite determinar la actitud de un autor respecto a un tema o la polaridad de una opinión."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import re\n",
    "\n",
    "from sentiment_analysis_spanish import sentiment_analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tweet</th>\n",
       "      <th>hashtags</th>\n",
       "      <th>mentions</th>\n",
       "      <th>links</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13099</th>\n",
       "      <td>El LG_ES G3 S ya a la venta por 299 euros -</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32607</th>\n",
       "      <td>No habrá ningún desahucio más en Madrid sin al...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69261</th>\n",
       "      <td>El partido socialista derogará la reforma labo...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59548</th>\n",
       "      <td>.mdcospedal inicia la campaña electoral del PP...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21659</th>\n",
       "      <td>BuenosDíasUAM!!! \"La clave de la educación no ...</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3344</th>\n",
       "      <td>Nada más lindo q estar en casa :)</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>64788</th>\n",
       "      <td>|| Yo no me entero de que ha pasado. :D</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>69124</th>\n",
       "      <td>Napolitanas con fritas para el bajon :P</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34177</th>\n",
       "      <td>Buenas noches. 💜</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2680</th>\n",
       "      <td>A las 12:30 conferencia de marianorajoy en el ...</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7019 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   tweet  hashtags  mentions  \\\n",
       "13099      El LG_ES G3 S ya a la venta por 299 euros -           0         1   \n",
       "32607  No habrá ningún desahucio más en Madrid sin al...         0         0   \n",
       "69261  El partido socialista derogará la reforma labo...         1         1   \n",
       "59548  .mdcospedal inicia la campaña electoral del PP...         0         1   \n",
       "21659  BuenosDíasUAM!!! \"La clave de la educación no ...         3         0   \n",
       "...                                                  ...       ...       ...   \n",
       "3344                   Nada más lindo q estar en casa :)         0         0   \n",
       "64788            || Yo no me entero de que ha pasado. :D         0         0   \n",
       "69124            Napolitanas con fritas para el bajon :P         0         0   \n",
       "34177                                  Buenas noches. 💜          0         0   \n",
       "2680   A las 12:30 conferencia de marianorajoy en el ...         1         3   \n",
       "\n",
       "       links  \n",
       "13099      2  \n",
       "32607      2  \n",
       "69261      0  \n",
       "59548      1  \n",
       "21659      0  \n",
       "...      ...  \n",
       "3344       0  \n",
       "64788      0  \n",
       "69124      0  \n",
       "34177      1  \n",
       "2680       1  \n",
       "\n",
       "[7019 rows x 4 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "TWEETS_URL = 'https://raw.githubusercontent.com/garnachod/TwitterSentimentDataset/master/tweets_clean.txt'\n",
    "# The dot after 'www' is escaped so that words such as 'awwww!' are not treated as links.\n",
    "URL_PATTERN = r'http\\S+|www\\.\\S+'\n",
    "\n",
    "\n",
    "def count_prefixed_tokens(text, prefix):\n",
    "    \"\"\"Return how many whitespace-separated tokens of `text` start with `prefix`.\"\"\"\n",
    "    return sum(token.startswith(prefix) for token in text.split())\n",
    "\n",
    "\n",
    "# Read the tab-separated file (no header row) and keep a reproducible 10% sample.\n",
    "tweets = pd.read_csv(TWEETS_URL, header=None, sep='\\t')\n",
    "tweets = tweets.sample(frac=0.1, random_state=42)\n",
    "tweets.columns = ['tweet']\n",
    "\n",
    "# feature engineering: count #, @, and links before they are stripped from the text\n",
    "for column, prefix in {'hashtags': '#', 'mentions': '@', 'links': 'http'}.items():\n",
    "    tweets[column] = tweets['tweet'].apply(count_prefixed_tokens, prefix=prefix)\n",
    "\n",
    "# remove #, @ (literal matches, hence regex=False), and links\n",
    "tweets['tweet'] = (\n",
    "    tweets['tweet']\n",
    "    .str.replace('#', '', regex=False)\n",
    "    .str.replace('@', '', regex=False)\n",
    "    .str.replace(URL_PATTERN, '', regex=True)\n",
    ")\n",
    "\n",
    "tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/alejo/.local/lib/python3.11/site-packages/sklearn/base.py:348: InconsistentVersionWarning: Trying to unpickle estimator CountVectorizer from version 0.23.2 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
      "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
      "  warnings.warn(\n",
      "/home/alejo/.local/lib/python3.11/site-packages/sklearn/base.py:348: InconsistentVersionWarning: Trying to unpickle estimator MultinomialNB from version 0.23.2 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
      "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "# Build the analyzer once; the cells below reuse this same `sentiment` object.\n",
    "sentiment = sentiment_analysis.SentimentAnalysisSpanish()\n",
    "\n",
    "# Score every tweet by handing the bound method straight to apply (no lambda wrapper needed).\n",
    "tweets['sentiment'] = tweets['tweet'].apply(sentiment.sentiment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9094362772997505"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check on a sentence with positive wording; the saved output of this cell is about 0.91.\n",
    "# NOTE(review): presumably scores close to 1 mean positive sentiment - confirm against the library docs.\n",
    "sentiment.sentiment(\"Me encanta el fútbol\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2.623559185538157e-05"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check on a sentence with negative wording; the saved output of this cell is about 2.6e-05.\n",
    "# NOTE(review): presumably scores close to 0 mean negative sentiment - confirm against the library docs.\n",
    "sentiment.sentiment(\"El producto, un desastre\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}