In [6]:
import os
import requests
import re
import ast
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import names, stopwords, words
from nltk.tokenize import word_tokenize
from rake_nltk import Rake
import nltk
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

Get Data¶

In [9]:
def get_news(q, df=True, from_date=None, to_date=None, language=None, searchIn=None, page_size=None, page=None):
    endpoint = 'https://newsapi.org/v2/everything'
    params = {
        'apiKey': os.getenv('NEWS_API_KEY'),
        'pageSize': page_size,
        'page': page
    }

    query_params = {
        'q': q,
        'from': from_date,
        'to': to_date,
        'language': language,
        'searchIn': searchIn
    }

    for key, value in query_params.items():
        if value is not None:
            params[key] = value

    response = requests.get(endpoint, params=params)
    response.raise_for_status()
    print(response.url)

    if df:
        return pd.json_normalize(pd.DataFrame(response.json())['articles'])
    else:
        return response.json()
In [11]:
df = get_news(
    q=f"{'water+climate+change+river+drinking'}", 
    df=True, 
    from_date='2024-09-08', 
    to_date='2024-10-08', 
    language='en',
    page_size=100, 
    page=1
)

display(df)
https://newsapi.org/v2/everything?apiKey=935933a6b0ce488f86af5022580c4f62&pageSize=100&page=1&q=water%2Bclimate%2Bchange%2Briver%2Bdrinking&from=2024-09-08&to=2024-10-08&language=en
author title description url urlToImage publishedAt content source.id source.name
0 Lauren Sommer Hurricanes are dangerous far from the coast. C... Coastal cities often bear the brunt of hurrica... https://www.npr.org/2024/10/01/nx-s1-5133530/h... https://npr.brightspotcdn.com/dims3/default/st... 2024-10-02T09:00:00Z Hurricane Helenes destructive path tore across... None NPR
1 None New tool to help decision-makers navigate poss... The Colorado River is a vital source of water ... https://www.sciencedaily.com/releases/2024/09/... https://www.sciencedaily.com/images/scidaily-i... 2024-09-20T20:08:11Z The Colorado River is a vital source of water ... None Science Daily
2 Matthew Carroll New tool to help decision makers navigate poss... The Colorado River is a vital source of water ... https://phys.org/news/2024-09-tool-decision-ma... https://scx2.b-cdn.net/gfx/news/hires/2024/new... 2024-09-21T15:05:35Z The Colorado River is a vital source of water ... None Phys.Org
3 Ariel Wittenberg, E&E News Hurricanes Helene’s Floods Swamped a Hospital,... Hurricane Helene forced dozens of medical faci... https://subscriber.politicopro.com/article/een... https://static.scientificamerican.com/dam/m/78... 2024-10-02T18:45:00Z CLIMATEWIRE | A dramatic helicopter evacuation... None Politicopro.com
4 Al Jazeera Water levels in major Amazon tributary tumble ... Climate change and below-average rainfall have... https://www.aljazeera.com/gallery/2024/10/5/wa... https://www.aljazeera.com/wp-content/uploads/2... 2024-10-05T01:49:16Z Its one of the largest rivers in the world. An... al-jazeera-english Al Jazeera English
... ... ... ... ... ... ... ... ... ...
91 Dariel Pradas A Cuban Town Improves Water Quality Through De... Overnight, hundreds of people in the rural com... https://www.ipsnews.net/2024/09/cuban-town-imp... https://www.ipsnews.net/Library/2024/09/Agua-1... 2024-09-09T15:44:14Z Editors' Choice, Featured, Headlines, Health, ... None Inter Press Service
92 Khabarhub Langtang National Park: Understanding climate ... Langtang National Park — Nepal’s High Mountain... https://english.khabarhub.com/2024/18/397905/ https://english.khabarhub.com/wp-content/uploa... 2024-09-18T06:15:48Z Langtang National Park — Nepal’s High Mountain... None Khabarhub.com
93 Oritro Karim Typhoon Yagi Devastates Southeast Asia In early September, Typhoon Yagi, a deadly tro... https://www.ipsnews.net/2024/09/typhoon-yagi-d... https://www.ipsnews.net/Library/2024/09/The-af... 2024-09-19T09:57:08Z Asia-Pacific, Climate Change, Economy & Tr... None Inter Press Service
94 Dima Al-Khatib A Better Tomorrow with South-South Cooperation The annual United Nations Day for South-South ... https://www.ipsnews.net/2024/09/better-tomorro... https://www.ipsnews.net/Library/2024/09/A-Bett... 2024-09-12T06:17:13Z Civil Society, Development & Aid, Economy ... None Inter Press Service
95 Rina Mukherji Rejuvenating Tradition Help Save Ancient Engin... Dhamapur is a small village in Malvan taluka o... https://www.ipsnews.net/2024/10/rejuvenating-t... https://www.ipsnews.net/Library/2024/10/Dhamap... 2024-10-02T09:50:31Z Arts, Asia-Pacific, Civil Society, Conservatio... None Inter Press Service

96 rows × 9 columns

In [12]:
# Function to parse the content of each article
def parse_article_content(url):
    print(url)
    try:
        response = requests.get(url, timeout=10)  # Set a timeout of 10 seconds
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the text from the article
    paragraphs = soup.find_all('p')
    article_text = ' '.join([para.get_text() for para in paragraphs])
    words = re.findall(r'\b\w+\b', article_text)
    return words

# Apply the function to each URL in the dataframe
df['parsed_content'] = df['url'].apply(parse_article_content)
https://www.npr.org/2024/10/01/nx-s1-5133530/hurricane-helene-rain-flooding-climate-change
https://www.sciencedaily.com/releases/2024/09/240920160811.htm
403 Client Error: Forbidden for url: https://www.sciencedaily.com/releases/2024/09/240920160811.htm
https://phys.org/news/2024-09-tool-decision-makers-futures-colorado.html
400 Client Error: Bad request for url: https://phys.org/news/2024-09-tool-decision-makers-futures-colorado.html
https://subscriber.politicopro.com/article/eenews/2024/10/02/a-dangerous-rescue-helene-made-a-hospital-almost-inescapable-00181734
https://www.aljazeera.com/gallery/2024/10/5/water-levels-in-major-amazon-tributary-tumble-to-record-lows-amid-drought
https://phys.org/news/2024-09-amazon-river-dries-hellish-villagers.html
400 Client Error: Bad request for url: https://phys.org/news/2024-09-amazon-river-dries-hellish-villagers.html
https://www.aljazeera.com/news/2024/9/19/storm-boris-floods-northern-italy-as-leaders-to-discuss-eu-aid-in-poland
https://phys.org/news/2024-09-nepal-surveys-wreckage-death-toll.html
400 Client Error: Bad request for url: https://phys.org/news/2024-09-nepal-surveys-wreckage-death-toll.html
https://www.nakedcapitalism.com/2024/09/how-cities-run-dry.html
https://phys.org/news/2024-09-city.html
400 Client Error: Bad request for url: https://phys.org/news/2024-09-city.html
https://www.poynter.org/reporting-editing/2024/climate-change-stories-are-everywhere-heres-how-to-find-them/
403 Client Error: Forbidden for url: https://www.poynter.org/reporting-editing/2024/climate-change-stories-are-everywhere-heres-how-to-find-them/
https://www.forbes.com/sites/kensilverstein/2024/10/07/heat-and-drought-imperil-the-panama-canal-and-adjoining-rainforests/
https://www.project-syndicate.org/magazine/water-security-crisis-nature-of-the-problem-and-what-to-do-by-mariana-mazzucato-et-al-2024-09
https://www.bbc.com/news/articles/cd6qvpe0dxqo?xtor=AL-72-%5Bpartner%5D-%5Byahoo.north.america%5D-%5Bheadline%5D-%5Bnews%5D-%5Bbizdev%5D-%5Bisapi%5D
http://grist.org/drought/in-arid-new-mexico-a-debate-over-reusing-oil-industry-wastewater/
https://phys.org/news/2024-09-gateway-arctic-fat-ice-polar.html
400 Client Error: Bad request for url: https://phys.org/news/2024-09-gateway-arctic-fat-ice-polar.html
https://thediplomat.com/2024/09/death-toll-climbs-to-199-in-vietnam-as-typhoons-aftermath-brings-flash-floods-and-landslides/
https://www.forbes.com/sites/amyfeldman/2024/09/13/a-new-technology-for-getting-rid-of-forever-chemicals-adam-neumann-clean-energy-transition/
https://phys.org/news/2024-09-world-million-tons-plastic-pollution.html
400 Client Error: Bad request for url: https://phys.org/news/2024-09-world-million-tons-plastic-pollution.html
https://www.ibtimes.com/drowned-hurricane-remote-ncarolina-towns-now-struggle-water-3745739
403 Client Error: Forbidden for url: https://www.ibtimes.com/drowned-hurricane-remote-ncarolina-towns-now-struggle-water-3745739
https://removed.com
HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)')))
https://www.voanews.com/a/chad-floods-kill-503-affect-1-7-million-people-un-says-/7793700.html
https://www.rawstory.com/nepal-surveys-flood-wreckage-as-toll-reaches-198/
403 Client Error: Forbidden for url: https://www.rawstory.com/nepal-surveys-flood-wreckage-as-toll-reaches-198/
https://removed.com
HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)')))
https://www.ibtimes.com/nepal-surveys-flood-wreckage-death-toll-reaches-200-3744951
403 Client Error: Forbidden for url: https://www.ibtimes.com/nepal-surveys-flood-wreckage-death-toll-reaches-200-3744951
https://removed.com
HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)')))
https://www.voanews.com/a/vietnam-death-toll-climbs-to-199-as-typhoon-aftermath-brings-flash-floods-landslides/7781246.html
https://www.yahoo.com/news/amazon-river-dries-creating-hellish-195035044.html
https://gcaptain.com/brazils-coastline-faces-accelerated-erosion-as-atlantic-advances/
https://www.bbc.com/news/articles/cd6qvpe0dxqo
https://www.insurancejournal.com/news/southeast/2024/10/04/795811.htm
https://www.irishtimes.com/environment/2024/10/04/new-59m-reservoir-will-enhance-security-and-resilience-of-water-supply-across-greater-dublin-area/
https://abcnews.go.com/International/wireStory/brazils-worst-drought-wildfires-rage-amazon-river-falls-113540209
404 Client Error: Not Found for url: https://abcnews.go.com/International/wireStory/brazils-worst-drought-wildfires-rage-amazon-river-falls-113540209
https://www.independent.ie/irish-news/experts-say-irish-economy-will-lose-billions-to-climate-change-amid-failure-to-prepare/a1807742926.html
https://punchng.com/ngo-seeks-support-for-borno-flood-victims/
https://ca.news.yahoo.com/imperial-oil-could-shutter-norman-001241672.html
https://economictimes.indiatimes.com/news/india/india-eu-agree-to-enhance-cooperation-in-sustainable-water-management/articleshow/113463895.cms
https://www.cbc.ca/news/canada/edmonton/site-c-dam-bc-hydro-alberta-peace-river-1.7322035
HTTPSConnectionPool(host='www.cbc.ca', port=443): Read timed out. (read timeout=10)
https://mymodernmet.com/independent-photographer-travel-photo-awards/
https://www.investing.com/news/world-news/depth-of-major-amazon-tributary-in-brazil-drops-to-record-low-in-severe-drought-3641797
403 Client Error: Forbidden for url: https://www.investing.com/news/world-news/depth-of-major-amazon-tributary-in-brazil-drops-to-record-low-in-severe-drought-3641797
https://www.digitaljournal.com/world/drowned-by-hurricane-remote-n-carolina-towns-now-struggle-for-water/article
403 Client Error: Forbidden for url: https://www.digitaljournal.com/world/drowned-by-hurricane-remote-n-carolina-towns-now-struggle-for-water/article
https://abcnews.go.com/Technology/wireStory/gateway-arctic-fat-ice-polar-bears-crucial-trouble-113957248
https://www.bostonherald.com/2024/09/27/wildfire-replanting-efforts/
https://cleantechnica.com/2024/09/25/risk-fingerprints-gray-rhinos-help-communication-strategy/
403 Client Error: Forbidden for url: https://cleantechnica.com/2024/09/25/risk-fingerprints-gray-rhinos-help-communication-strategy/
https://abcnews.go.com/US/wireStory/after-storms-francine-new-orleans-rushes-dry-113675709
404 Client Error: Not Found for url: https://abcnews.go.com/US/wireStory/after-storms-francine-new-orleans-rushes-dry-113675709
https://punchng.com/mr-president-was-the-borno-flood-a-natural-disaster/
https://www.commondreams.org/opinion/appalachia-climate-crisis
403 Client Error: Forbidden for url: https://www.commondreams.org/opinion/appalachia-climate-crisis
https://thehillstimes.in/international/vietnam-death-toll-climbs-to-199-as-typhoons-aftermath-brings-flash-floods-and-landslides
https://www.abc.net.au/news/2024-09-11/walgett-namoi-river-water-testing-reveals-metals-pesticides/104329362
https://abcnews.go.com/Health/wireStory/amazon-ashaninka-tribe-restored-territory-now-aim-change-113620295
404 Client Error: Not Found for url: https://abcnews.go.com/Health/wireStory/amazon-ashaninka-tribe-restored-territory-now-aim-change-113620295
https://iasexamportal.com/upsc-mains/papers/2024-general-studies-paper-4
https://www.yahoo.com/news/many-forests-fail-recover-wildfires-133105574.html
https://slate.com/life/2024/10/asheville-floods-trees-art-damage-death-dams.html
http://deadline.com/2024/09/micheaux-film-festival-2024-viola-davis-michael-ealy-morris-chestnut-michael-gandolfini-1236090280/
https://www.americanthinker.com/blog/2024/09/indigenous_communities_sound_the_alarm_on_the_environmental_destruction_caused_by_surge_of_illegals_invited_through_biden_harris_open_border.html
https://www.pbs.org/newshour/world/brazil-faces-its-worst-drought-as-wildfires-rage-and-amazon-river-falls-to-record-low
https://www.forbes.com/sites/kailayu/2024/09/30/57-irresistible-pastas-from-restaurants-worldwide-for-pasta-month/
https://thehillstimes.in/international/rescue-operations-continue-as-over-200-people-killed-in-floods-landslides-in-nepal
https://www.business-standard.com/world-news/amid-drought-in-brazil-wildfires-rage-amazon-level-falls-to-record-low-124091000052_1.html
403 Client Error: Forbidden for url: https://www.business-standard.com/world-news/amid-drought-in-brazil-wildfires-rage-amazon-level-falls-to-record-low-124091000052_1.html
https://www.business-standard.com/world-news/vietnam-toll-climbs-to-199-as-yagi-aftermath-brings-flash-flood-landslides-124091200601_1.html
403 Client Error: Forbidden for url: https://www.business-standard.com/world-news/vietnam-toll-climbs-to-199-as-yagi-aftermath-brings-flash-flood-landslides-124091200601_1.html
https://removed.com
HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)')))
https://www.ndtv.com/world-news/rescue-operations-continue-as-over-200-people-killed-in-floods-landslides-in-nepal-6683768
https://removed.com
HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)')))
https://www.globenewswire.com/news-release/2024/09/24/2952352/0/en/Water-Quality-Instruments-Market-is-Exhibiting-a-CAGR-of-7-10-during-the-forecast-period-Exactitude-Consultancy.html
https://journals.plos.org/plosntds/article?id=10.1371/journal.pntd.0012139
https://www.cbc.ca/news/canada/manitoba/trouble-arctic-ecosystem-polar-bears-1.7332545
HTTPSConnectionPool(host='www.cbc.ca', port=443): Read timed out. (read timeout=10)
https://www.globenewswire.com/news-release/2024/10/03/2957680/0/en/Beyond-Green-Welcomes-New-Member-Properties-in-Africa-Europe-and-Latin-America-Thoughtfully-Expanding-its-Global-Footprint.html
https://financialpost.com/pmn/as-many-forests-fail-to-recover-from-wildfires-replanting-efforts-face-huge-odds-and-obstacles
403 Client Error: Forbidden for url: https://financialpost.com/pmn/as-many-forests-fail-to-recover-from-wildfires-replanting-efforts-face-huge-odds-and-obstacles
https://bmjopen.bmj.com/content/14/9/e083624
https://abcnews.go.com/Business/wireStory/forests-fail-recover-wildfires-replanting-efforts-face-huge-114275681
https://www.pbs.org/newshour/world/research-reveals-threats-polar-bears-face-as-climate-change-melts-arctic-ice-hunting-grounds
https://removed.com
HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)')))
https://www.theatlantic.com/ideas/archive/2024/09/michael-regan-epa-environmental-justice-lawsuit/679941/
https://www.bangkokpost.com/life/social-and-lifestyle/2866857/navigating-rain-bombs
https://slate.com/technology/2024/10/hurricane-helene-destruction-north-carolina-florida-georgia-climate-change.html
https://richmond.com/news/nation-world/israel-fights-on-two-fronts/article_62edc161-35aa-52fe-b765-1fa0cface714.html
https://removed.com
HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)')))
https://removed.com
HTTPSConnectionPool(host='removed.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:1002)')))
https://journals.plos.org/plosntds/article?id=10.1371/journal.pntd.0012483
https://roanoke.com/news/nation-world/israel-fights-on-two-fronts/article_fd2c97b3-87b7-5629-beef-cfdb9520d6bf.html
429 Client Error: Too Many Requests for url: https://roanoke.com/news/nation-world/israel-fights-on-two-fronts/article_fd2c97b3-87b7-5629-beef-cfdb9520d6bf.html
https://www.seattlepi.com/news/world/article/amid-the-worst-drought-in-brazil-history-19753496.php
403 Client Error: Forbidden for url: https://www.seattlepi.com/news/world/article/amid-the-worst-drought-in-brazil-history-19753496.php
https://www.seattlepi.com/news/article/ap-photos-hallmarks-of-climate-change-seen-in-19771941.php
403 Client Error: Forbidden for url: https://www.seattlepi.com/news/article/ap-photos-hallmarks-of-climate-change-seen-in-19771941.php
https://www.ibtimes.com.au/nepal-surveys-flood-wreckage-death-toll-reaches-200-1852011
403 Client Error: Forbidden for url: https://www.ibtimes.com.au/nepal-surveys-flood-wreckage-death-toll-reaches-200-1852011
https://journals.plos.org/plosntds/article?id=10.1371/journal.pntd.0012508
https://www.resilience.org/stories/2024-09-26/hurricanes-dont-stop-at-the-coast-these-mountain-towns-know-how-severe-inland-flood-damage-can-be-and-theyre-watching-helene/
https://www.seattlepi.com/news/world/article/in-the-amazon-the-ashaninka-tribe-restored-their-19760299.php
403 Client Error: Forbidden for url: https://www.seattlepi.com/news/world/article/in-the-amazon-the-ashaninka-tribe-restored-their-19760299.php
https://nation.africa/kenya/health/report-reveals-why-african-countries-face-persistent-water-woes-4776490
https://www.esquire.com/sports/a62215598/vince-lombardi-toughest-nfl-coach-1968/
https://press.un.org/en/2024/db241007.doc.htm
https://itif.org/publications/2024/09/16/china-is-rapidly-becoming-a-leading-innovator-in-advanced-industries/
 https://en.setopati.com/International/163665 
404 Client Error: Not Found for url: https://en.setopati.com/International/163665%20
https://www.ipsnews.net/2024/09/cuban-town-improves-water-quality-desalination/
https://english.khabarhub.com/2024/18/397905/
https://www.ipsnews.net/2024/09/typhoon-yagi-devastates-southeast-asia/
https://www.ipsnews.net/2024/09/better-tomorrow-south-south-cooperation/
https://www.ipsnews.net/2024/10/rejuvenating-traditions-help-save-ancient-engineering-marvel-dhamapur-lake/
In [6]:
df = df.dropna(subset=['parsed_content'])
df = df.drop_duplicates(subset=['title'])
display(df['parsed_content'])
# Save raw data
df.to_csv('data/water+climate+change+river+drinking_news_202140908_20241008.csv', index=False)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 df = df.dropna(subset=['parsed_content'])
      2 df = df.drop_duplicates(subset=['title'])
      3 display(df['parsed_content'])

NameError: name 'df' is not defined

Clean Data¶

In [7]:
# Download the list of English words and stopwords
nltk.download('words')
nltk.download('stopwords')
english_words = set(nltk.corpus.words.words())
stop_words = set(stopwords.words('english'))

# Load the data
df = pd.read_csv('data/water+climate+change+river+drinking_news_202140908_20241008.csv')

def clean_content(content):
    if isinstance(content, list):
        content = ' '.join(content)
    
    # Convert to lowercase
    content = content.lower()
    
    # Replace 'climate change' with 'climatechange'
    content = content.replace('climate change', 'climatechange')
    
    # Remove numbers and dates
    content = re.sub(r'\d+', '', content)
    content = re.sub(r'\b\d{4}\b', '', content)  # Remove 4-digit years
    
    # Summarize the content
    parser = PlaintextParser.from_string(content, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, 5)  # Summarize to 5 sentences
    content = ' '.join([str(sentence) for sentence in summary])
    
    # Initialize RAKE
    rake = Rake()
    
    # Extract keywords
    rake.extract_keywords_from_text(content)
    keywords = rake.get_ranked_phrases()
    
    # Filter out non-English words and stopwords
    keywords = [word for word in keywords if word in english_words and word not in stop_words]
    
    # Remove duplicate words within a transaction
    keywords = list(set(keywords))
    
    return keywords

# Apply the clean_content function
df['transactions'] = df['parsed_content'].apply(ast.literal_eval)
df['cleaned_content'] = df['transactions'].apply(clean_content)

# Remove duplicates
df = df.drop_duplicates(subset=['cleaned_content'])

# Remove rows with empty cleaned_content
df = df[df['cleaned_content'].apply(lambda x: len(x) > 0)]

# Display the DataFrame
display(df[['parsed_content', 'transactions', 'cleaned_content']])

# Convert series of lists into a format suitable for R's read.transactions
def write_basket_format(df, filename):
    with open(filename, 'w') as f:
        for transaction in df['cleaned_content']:
            f.write(','.join(transaction) + '\n')

# Write to a text file
write_basket_format(df, 'transactions_basket.txt')
[nltk_data] Downloading package words to
[nltk_data]     /Users/garrettflowers/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/garrettflowers/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
parsed_content transactions cleaned_content
0 ['Lauren', 'Sommer', 'Extreme', 'rain', 'is', ... [Lauren, Sommer, Extreme, rain, is, becoming, ... [known, works, warning, know, rainfall, floodi...
2 ['In', 'Pictures', 'It', 's', 'one', 'of', 'th... [In, Pictures, It, s, one, of, the, largest, r... [community, also, world, intense, end, country...
3 ['Italy', 's', 'Emilia', 'Romagna', 'region', ... [Italy, s, Emilia, Romagna, region, sees, thre... [aid, suspended, led, frequency, evacuation, c...
4 ['Yves', 'here', 'Yours', 'truly', 'has', 'bee... [Yves, here, Yours, truly, has, been, under, r... [pacific, glass, amount, return, prepared, cha...
5 ['GAMBOA', 'PANAMA', 'SEPTEMBER', '20', 'In', ... [GAMBOA, PANAMA, SEPTEMBER, 20, In, an, aerial... [causing, post, background, resulting, open, a...
6 ['Increasingly', 'severe', 'water', 'shortages... [Increasingly, severe, water, shortages, repre... [also, already, resolved, link, receive, syste...
7 ['Water', 'levels', 'in', 'many', 'of', 'the',... [Water, levels, in, many, of, the, rivers, in,... [anchored, agriculture, role, critical, likely...
8 ['Δ', 'A', 'nonprofit', 'independent', 'media'... [Δ, A, nonprofit, independent, media, organiza... [spread, addition, uranium, interview, return,...
9 ['Read', 'The', 'Diplomat', 'Know', 'The', 'As... [Read, The, Diplomat, Know, The, Asia, Pacific... [damage, goods, response, flooding, shelter, s...
10 ['Plus', 'Adam', 'Neumann', 's', 'climate', 'c... [Plus, Adam, Neumann, s, climate, company, is,... [post, open, end, go, work, event, purpose, ma...
11 ['Print', 'Severe', 'flooding', 'in', 'Chad', ... [Print, Severe, flooding, in, Chad, since, Jul... [summer, chad, slew, trouble, come, capital, d...
12 ['Print', 'The', 'death', 'toll', 'in', 'the',... [Print, The, death, toll, in, the, aftermath, ... [also, hand, aid, strategic, ground, flooding,...
13 ['Only', 'the', 'youngest', 'and', 'strongest'... [Only, the, youngest, and, strongest, villager... [goods, favorite, end, sale, country, rest, go...
14 ['Essential', 'news', 'coupled', 'with', 'the'... [Essential, news, coupled, with, the, finest, ... [rate, ocean, catastrophe, loss, arsenic, subm...
15 ['HENDERSONVILLE', 'N', 'C', 'AP', 'Hurricane'... [HENDERSONVILLE, N, C, AP, Hurricane, Helene, ... [community, reminder, topic, map, friend, floo...
16 ['The', 'completion', 'of', 'a', 'new', '59', ... [The, completion, of, a, new, 59, million, res... [works, landscape, critical, likely, project, ...
17 ['There', 'have', 'been', 'calls', 'for', 'the... [There, have, been, calls, for, the, Governmen... [works, also, spread, damage, spring, calling,...
18 ['Most', 'Widely', 'Read', 'Newspaper', 'An', ... [Most, Widely, Read, Newspaper, An, aerial, vi... [danger, also, solidarity, flooding, earn, cit...
19 ['Imperial', 'Oil', 'and', 'Sahtu', 'leaders',... [Imperial, Oil, and, Sahtu, leaders, are, lock... [review, likely, end, operation, many, go, lan...
20 ['Artificial', 'Intelligence', 'AI', 'AI', 'fo... [Artificial, Intelligence, AI, AI, for, Everyo... [interested, web, experience, money, finance, ...
21 ['Supplies', 'for', 'Phugtal', 'Monastery', 'b... [Supplies, for, Phugtal, Monastery, by, Andrew... [end, country, work, food, search, variety, sy...
22 ['In', 'the', 'gateway', 'to', 'the', 'Arctic'... [In, the, gateway, to, the, Arctic, fat, and, ... [policy, amount, battle, staple, end, rest, bo...
23 ['E', 'Edition', 'Sign', 'up', 'for', 'email',... [E, Edition, Sign, up, for, email, newsletters... [forest, open, likely, go, work, private, righ...
24 ['Most', 'Widely', 'Read', 'Newspaper', 'Steph... [Most, Widely, Read, Newspaper, Stephen, Angbu... [evening, grant, open, pray, hear, country, mi...
25 ['08', 'October', '2024', 'ePaper', '07', 'Oct... [08, October, 2024, ePaper, 07, October, 2024,... [damage, know, goods, flooding, shelter, sapa,...
26 ['Search', 'the', 'news', 'stories', 'people',... [Search, the, news, stories, people, Personali... [fact, work, familiar, provide, search, found,...
27 ['HOT', 'UPSC', 'IAS', 'ONLINE', 'COURSE', 'NE... [HOT, UPSC, IAS, ONLINE, COURSE, NEW, UPSC, MA... [procurement, spread, resolved, plunging, disc...
28 ['BELLVUE', 'Colo', 'AP', 'Camille', 'Stevens'... [BELLVUE, Colo, AP, Camille, Stevens, Rumann, ... [forest, open, likely, end, sale, country, go,...
29 ['In', 'What', 'It', 's', 'Like', 'people', 't... [In, What, It, s, Like, people, tell, us, well... [evening, based, amount, interview, hear, prep...
30 ['By', 'Zac', 'Ntim', 'International', 'Report... [By, Zac, Ntim, International, Reporter, EXCLU... [return, prep, appearance, fourth, make, wound...
31 ['Author', 's', 'note', 'Immigrant', 'implies'... [Author, s, note, Immigrant, implies, a, forei... [guardian, work, stretch, provide, make, found...
32 ['Fabiano', 'Maisonnave', 'Associated', 'Press... [Fabiano, Maisonnave, Associated, Press, Fabia... [landscape, world, intense, country, burned, n...
33 ['Valentine', 'PHX', 's', 'Elote', 'Pasta', 'I... [Valentine, PHX, s, Elote, Pasta, It, s, Natio... [nod, personal, anchored, union, post, based, ...
34 ['08', 'October', '2024', 'ePaper', '07', 'Oct... [08, October, 2024, ePaper, 07, October, 2024,... [spread, post, leave, gear, amount, know, resp...
35 ['Search', 'and', 'rescue', 'operations', 'con... [Search, and, rescue, operations, continued, i... [headline, also, spread, post, drainage, gear,...
36 ['September', '24', '2024', '10', '24', 'ET', ... [September, 24, 2024, 10, 24, ET, Source, Exac... [alkalinity, addition, gaining, food, found, s...
37 ['PLOS', 'Neglected', 'Tropical', 'Diseases', ... [PLOS, Neglected, Tropical, Diseases, is, the,... [compare, based, topic, addition, calculated, ...
38 ['October', '03', '2024', '08', '05', 'ET', 'S... [October, 03, 2024, 08, 05, ET, Source, Beyond... [community, social, passion, book, fostering, ...
39 ['Article', 'Text', 'PDF', 'PDF', 'Supplementa... [Article, Text, PDF, PDF, Supplementary, Mater... [translation, reverse, escape, available, opin...
40 ['Seth', 'Borenstein', 'Associated', 'Press', ... [Seth, Borenstein, Associated, Press, Seth, Bo... [policy, amount, battle, staple, end, rest, bo...
41 ['Michael', 'Regan', 'seemed', 'like', 'he', '... [Michael, Regan, seemed, like, he, was, spoili... [strident, possibility, policy, battle, ground...
42 ['As', 'the', 'North', 'grapples', 'with', 'fl... [As, the, North, grapples, with, floods, famil... [addition, country, go, provide, bought, face,...
43 ['We', 'haven', 't', 'seen', 'a', 'disaster', ... [We, haven, t, seen, a, disaster, like, this, ... [warning, diameter, return, ground, harder, el...
44 ['Alexis', 'Christoforous', 'has', 'the', 'lat... [Alexis, Christoforous, has, the, latest, on, ... [earthquake, end, country, operation, go, atta...
45 ['PLOS', 'Neglected', 'Tropical', 'Diseases', ... [PLOS, Neglected, Tropical, Diseases, is, the,... [possibility, spread, reproduction, available,...
46 ['PLOS', 'Neglected', 'Tropical', 'Diseases', ... [PLOS, Neglected, Tropical, Diseases, is, the,... [assistance, grant, addition, investigation, c...
47 ['Insight', 'and', 'inspiration', 'in', 'turbu... [Insight, and, inspiration, in, turbulent, tim... [open, map, river, commerce, respond, guard, s...
48 ['Hello', 'Your', 'subscription', 'is', 'almos... [Hello, Your, subscription, is, almost, coming... [likely, continent, end, demise, rest, country...
49 ['If', 'you', 'weren', 't', 'fired', 'up', 'to... [If, you, weren, t, fired, up, to, play, for, ... [ground, digs, mouth, custom, escape, availabl...
50 ['The', 'following', 'is', 'a', 'near', 'verba... [The, following, is, a, near, verbatim, transc... [refer, vacuum, unbalanced, fact, phone, likel...
51 ['There', 'may', 'be', 'no', 'more', 'importan... [There, may, be, no, more, important, question... [erode, policy, topic, strategy, exceeding, av...
52 ['Tuesday', 'October', '8', '2024', 'Two', 'pe... [Tuesday, October, 8, 2024, Two, people, colle... [addition, quench, interview, country, membran...
53 ['18', 'September', '2024', 'Time', 'taken', '... [18, September, 2024, Time, taken, to, read, 1... [forest, policy, topic, grazing, earthquake, u...
54 ['Tuesday', 'October', '8', '2024', 'The', 'af... [Tuesday, October, 8, 2024, The, aftermath, of... [agriculture, damage, addition, led, critical,...
55 ['Tuesday', 'October', '8', '2024', 'Credit', ... [Tuesday, October, 8, 2024, Credit, United, Na... [satisfactory, solidarity, enhance, crucial, a...
56 ['Tuesday', 'October', '8', '2024', 'The', 'Vi... [Tuesday, October, 8, 2024, The, Vijayanagar, ... [fact, reverse, encroachment, flora, found, pl...
In [8]:
# Display transaction top 5 lines
with open('transactions_basket.txt', 'r') as f:
    for i, line in enumerate(f):
        if i < 5:
            print(line.strip())
        else:
            break
known,works,warning,know,rainfall,flooding,country,rest,reflect,hard,ground,soaking,turn,becoming,happening,cut,city,make,river,runoff,rebuild,live,difference,u,mud,professor,adequate,sign,risk,cutting,isolated,effort,eliminate,able,infrastructure,fast,think,designed,year,deadly,severity,atlas,trend,imagine,problem,situation,making,today,set,streets,hurricane,may,built,rain,hold,depend,category,disaster,chance,better,future,plan,longer,association,people,safety,much,continue,going,safe,handle
community,also,world,intense,end,country,right,spark,people,researcher,percent,one,measurement,paper,bathing,around,shrunk,tumble,statement,brazil,fallen,covered,told,transportation,port,water,jump,head,environmental,stress,weekend,north,drought,suffering,common,way,effects,everything,depth,struggling,much,revenue
aid,suspended,led,frequency,evacuation,country,contribution,electricity,person,disposed,upriver,city,place,intensity,money,met,mayor,mud,wave,increasing,fallen,said,infrastructure,five,help,well,closed,necessary,southwest,housing,carried,scale,water,governor,saying,way,people,rise,town
pacific,glass,amount,return,prepared,charge,go,theory,many,prerequisite,provide,make,found,place,large,attack,variety,physics,sealed,understanding,overuse,research,nearly,filled,merely,five,revive,lake,tiny,contact,told,declare,water,reduce,company,form,answer,emergency,combination,area,solution,add,possible,reach,continue,stave,basin,leach,rainfall,none,suppose,stud,department,experience,struck,coming,believe,starting,say,links,u,effort,university,series,lot,sides,internationally,year,application,severity,gag,exposed,subject,example,volume,reason,want,potential,instance,often,dip,partnership,cornerstone,land,early,panel,much,runoff,able,news,already,cadmium,metal,dry,world,month,least,unfolding,proceed,apply,would,evaporation,one,new,result,city,used,method,aluminium,half,worded,responsible,drastically,usual,plenty,lookout,running,top,average,remember,recipe,making,summer,industry,rain,even,winter,record,plastic,better,meant,store,climate,pineapple,seen,ask,flat,cause,know,led,grow,storage,reservoir,failure,live,could,mayor,use,come,lost,aware,defense,sold,energy,left,irrigate,fed,always,truly,precipitation,course,size,time,like,done,number,chile,globe,roof,issue,call,sewage,alone,made,without,deal
causing,post,background,resulting,open,amount,country,operation,provide,river,happen,feed,scientist,travel,panama,office,water,institute,soil,engaged,part,site,service,crept,combined,reading,crane,contribute,community,rainfall,creation,talk,growth,system,believe,canoe,week,order,modify,series,second,want,hike,monitor,drought,drop,ship,focus,civil,steal,much,rate,world,one,expand,result,filling,notice,money,waiting,pass,note,millions,necessary,well,blocked,quest,average,wait,integrity,canopy,die,importance,environmental,canal,history,colon,ton,restrict,release,authority,carry,watched,shrouded,transit,cargo,check,movement,diversity,economy,atlantic,completion,cost,region,accommodate,designed,decrease,done,share,mountainous,number,depend,protecting,challenge,lack,communication

ARM¶

In [1]:
# Load necessary libraries
library(arules)
library(arulesViz)

# Read transactions
data <- read.transactions("transactions_basket.txt", format = "basket", sep = ",")

# Generate frequent itemsets
frequent_itemsets <- apriori(data, parameter = list(supp = 0.2, target = "frequent itemsets"))

# Generate association rules
rules <- apriori(data, parameter = list(supp = 0.2, conf = 0.8, target = "rules"))

# Check if any rules were generated
if (length(rules) > 0) {
  # Print top 15 rules by support
  top_support <- sort(rules, by = "support", decreasing = TRUE)[1:15]
  cat("Top 15 rules by support:\n")
  inspect(top_support)
  
  # Print top 15 rules by confidence
  top_confidence <- sort(rules, by = "confidence", decreasing = TRUE)[1:15]
  cat("\nTop 15 rules by confidence:\n")
  inspect(top_confidence)
  
  # Print top 15 rules by lift
  top_lift <- sort(rules, by = "lift", decreasing = TRUE)[1:15]
  cat("\nTop 15 rules by lift:\n")
  inspect(top_lift)
  
  # Plot the rules
  plot(rules, method = "graph", control = list(type = "items"))
} else {
  print("No association rules were generated.")
}
Warning message:
"package 'arules' was built under R version 4.3.3"
Loading required package: Matrix


Attaching package: 'arules'


The following objects are masked from 'package:base':

    abbreviate, write


Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
         NA    0.1    1 none FALSE            TRUE       5     0.2      1
 maxlen            target  ext
     10 frequent itemsets TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 11 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[3627 item(s), 56 transaction(s)] done [0.00s].
sorting and recoding items ... [67 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [292 set(s)] done [0.00s].
creating S4 object  ... done [0.00s].
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.8    0.1    1 none FALSE            TRUE       5     0.2      1
 maxlen target  ext
     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 11 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[3627 item(s), 56 transaction(s)] done [0.00s].
sorting and recoding items ... [67 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 done [0.00s].
writing ... [144 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].
Top 15 rules by support:
     lhs        rhs      support   confidence coverage  lift     count
[1]  {said}  => {people} 0.3750000 0.8750000  0.4285714 1.361111 21   
[2]  {get}   => {water}  0.3035714 1.0000000  0.3035714 1.696970 17   
[3]  {time}  => {water}  0.3035714 0.8500000  0.3571429 1.442424 17   
[4]  {would} => {year}   0.2857143 0.8421053  0.3392857 1.684211 16   
[5]  {would} => {water}  0.2857143 0.8421053  0.3392857 1.429027 16   
[6]  {make}  => {way}    0.2857143 0.8421053  0.3392857 1.964912 16   
[7]  {used}  => {year}   0.2857143 0.8000000  0.3571429 1.600000 16   
[8]  {used}  => {water}  0.2857143 0.8000000  0.3571429 1.357576 16   
[9]  {time}  => {people} 0.2857143 0.8000000  0.3571429 1.244444 16   
[10] {go}    => {year}   0.2678571 1.0000000  0.2678571 2.000000 15   
[11] {think} => {year}   0.2678571 0.9375000  0.2857143 1.875000 15   
[12] {able}  => {water}  0.2678571 0.8333333  0.3214286 1.414141 15   
[13] {lot}   => {year}   0.2500000 0.8750000  0.2857143 1.750000 14   
[14] {far}   => {people} 0.2500000 1.0000000  0.2500000 1.555556 14   
[15] {go}    => {water}  0.2500000 0.9333333  0.2678571 1.583838 14   

Top 15 rules by confidence:
     lhs               rhs      support   confidence coverage  lift     count
[1]  {far}          => {people} 0.2500000 1          0.2500000 1.555556 14   
[2]  {go}           => {year}   0.2678571 1          0.2678571 2.000000 15   
[3]  {want}         => {water}  0.2500000 1          0.2500000 1.696970 14   
[4]  {get}          => {water}  0.3035714 1          0.3035714 1.696970 17   
[5]  {lot, would}   => {year}   0.2142857 1          0.2142857 2.000000 12   
[6]  {lot, used}    => {year}   0.2142857 1          0.2142857 2.000000 12   
[7]  {get, go}      => {year}   0.2142857 1          0.2142857 2.000000 12   
[8]  {get, go}      => {water}  0.2142857 1          0.2142857 1.696970 12   
[9]  {go, would}    => {year}   0.2142857 1          0.2142857 2.000000 12   
[10] {go, would}    => {water}  0.2142857 1          0.2142857 1.696970 12   
[11] {go, people}   => {year}   0.2142857 1          0.2142857 2.000000 12   
[12] {go, water}    => {year}   0.2500000 1          0.2500000 2.000000 14   
[13] {going, think} => {year}   0.2321429 1          0.2321429 2.000000 13   
[14] {going, way}   => {know}   0.2142857 1          0.2142857 2.800000 12   
[15] {want, year}   => {water}  0.2142857 1          0.2142857 1.696970 12   

Top 15 rules by lift:
     lhs                     rhs     support   confidence coverage  lift    
[1]  {know, think, year}  => {going} 0.2142857 1.0000000  0.2142857 3.733333
[2]  {going, know, year}  => {think} 0.2142857 1.0000000  0.2142857 3.500000
[3]  {get, year}          => {go}    0.2142857 0.9230769  0.2321429 3.446154
[4]  {know, think}        => {going} 0.2142857 0.9230769  0.2321429 3.446154
[5]  {get, water, year}   => {go}    0.2142857 0.9230769  0.2321429 3.446154
[6]  {going, year}        => {think} 0.2321429 0.9285714  0.2500000 3.250000
[7]  {think, year}        => {going} 0.2321429 0.8666667  0.2678571 3.235556
[8]  {going, know}        => {think} 0.2142857 0.9230769  0.2321429 3.230769
[9]  {know, year}         => {going} 0.2142857 0.8571429  0.2500000 3.200000
[10] {water, would, year} => {go}    0.2142857 0.8571429  0.2500000 3.200000
[11] {going}              => {think} 0.2321429 0.8666667  0.2678571 3.033333
[12] {think}              => {going} 0.2321429 0.8125000  0.2857143 3.033333
[13] {know, year}         => {think} 0.2142857 0.8571429  0.2500000 3.000000
[14] {way, year}          => {think} 0.2142857 0.8571429  0.2500000 3.000000
[15] {know, way}          => {going} 0.2142857 0.8000000  0.2678571 2.986667
     count
[1]  12   
[2]  12   
[3]  12   
[4]  12   
[5]  12   
[6]  13   
[7]  13   
[8]  12   
[9]  12   
[10] 12   
[11] 13   
[12] 13   
[13] 12   
[14] 12   
[15] 12   
Warning message:
"Unknown control parameters: type"
Available control parameters (with default values):
layout	 =  stress
circular	 =  FALSE
ggraphdots	 =  NULL
edges	 =  <environment>
nodes	 =  <environment>
nodetext	 =  <environment>
colors	 =  c("#EE0000FF", "#EEEEEEFF")
engine	 =  ggplot2
max	 =  100
verbose	 =  FALSE
Warning message:
"Too many rules supplied. Only plotting the best 100 using 'lift' (change control parameter max if needed)."
No description has been provided for this image