Project description

Py Collector

Py Collector is a simple, reliable, DB-agnostic framework for consistently collecting data from any source.

It utilizes two main components, the Collector and the Scheduler.

Check out the imports needed to run the examples here.

Code Examples

Data	Code
Collect Weather Data into SQL Alchemy	class Weather(Collector): start_time = datetime.now()#to start immediatly scheduler = Scheduler(days=1/24, count=1, separator=1, start_time = start_time) def upload(self): ''' Runs on schedule, and will only run if is_new returns true''' r = requests.get('https://api.weather.gov/gridpoints/FWD/59,23/forecast') data = r.json()['properties']['periods'] points = [] for i in data: data_point = WeatherDataPoint( start_date=datetime.fromisoformat(i['startTime']), end_date=datetime.fromisoformat(i['endTime']), temp=i['temperature'], windspeed=i['windSpeed'] ) points.append(data_point) session.add_all(points) session.commit() def is_new(self): '''Evaluates if the data should be uploaded, if it only returns True, then it will just upload on schedule.''' return True
Collect Energy Data into a CSV every minute	class Energy(Collector): start_time = datetime.now()#to start immediatly scheduler = Scheduler(minutes=1, #every minute count=2, #try 3 times separator=2, #two seconds between tries start_time = start_time) first_run = True last_update = None def upload(self): ''' Runs on schedule, and will only run if is_new returns true''' df = pd.read_html(self.get_site.text)[0] title = 'ercot_dam_clearing_'+self.last_update.strftime('%m_%d_%Y')+'.csv' file = open(title,'w') df.to_csv(file) def is_new(self): '''Evaluates if the data should be uploaded, if it only returns True, then it will just upload on schedule.''' if self.first_run: #first run load whatever is there self.first_run = False self.last_update = self.get_last_changed() return True else: #if it has changed since we last updated, download last_changed = self.get_last_changed() if self.last_update < last_changed: self.last_update = last_changed return True else: return False def get_site(self): return requests.get('http://www.ercot.com/content/cdr/html/actual_loads_of_forecast_zones') def soup(self): r = self.get_site() return BeautifulSoup(r.text,'html.parser') def get_last_changed(self): soup = self.soup() last_change = soup.find('div',attrs={'class':'schedTime rightAlign'}) last_change = last_change.text.split('Time:')[1].lstrip() return datetime.strptime(last_change,'%b %d, %Y %H:%M')
Collect TikTok Data into a MongoDB every day	class TikTokUser(MongoModel): username = fields.CharField() followers = fields.CharField() likes = fields.CharField() following = fields.CharField() class Meta: write_concern = WriteConcern(j=True) connection_alias = 'my-app' class TikTok(Collector): start_time = datetime.now() scheduler = Scheduler(days=1, #every date count=1, #try 1 times separator=1, #not applicable start_time = start_time) #start now def upload(self): ''' Runs on schedule, and will only run if is_new returns true''' data = self.user_stats('gordonramsayofficial') user = TikTokUser.from_document(data) user.save() def is_new(self): '''Evaluates if the data should be uploaded, if it only returns True, then it will just upload on schedule.''' return True def user_stats(self,user ='gordonramsayofficial'): r = self.user_raw(user) soup = BeautifulSoup(r.text,'html.parser') info = soup.find('h2',attrs={'class':'count-infos'}) return { 'following':info.find('strong',attrs={'title':'Following'}).text, 'followers':info.find('strong',attrs={'title':'Followers'}).text, 'likes':info.find('strong',attrs={'title':'Likes'}).text, 'username':user } def user_raw(self, user): headers={ "authority": "m.tiktok.com", "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0", "method": "GET", "scheme": "https", "accept": "application/json, text/plain, /", "accept-encoding": 'gzip, deflate, utf-8', "accept-language": "en-US,en;q=0.9", "sec-fetch-dest": "empty", "sec-fetch-mode": "cors", "sec-fetch-site": "same-site", "sec-gpc": "1" } return requests.get(f'https://www.tiktok.com/@{user}?lang=en',headers=headers)

Data

Code

Collect Weather Data into SQL Alchemy

class Weather(Collector):
    """Example collector that stores weather.gov forecast periods via a
    SQLAlchemy session."""

    start_time = datetime.now()  # start immediately
    scheduler = Scheduler(
        days=1/24,  # 1/24 of a day between runs
        count=1,
        separator=1,
        start_time=start_time,
    )

    def upload(self):
        """Fetch the forecast and persist one WeatherDataPoint per period.

        Runs on schedule, and only when is_new returns True.
        """
        response = requests.get('https://api.weather.gov/gridpoints/FWD/59,23/forecast')
        periods = response.json()['properties']['periods']

        # Build every row first so nothing is added to the session if a
        # period is malformed.
        points = [
            WeatherDataPoint(
                start_date=datetime.fromisoformat(period['startTime']),
                end_date=datetime.fromisoformat(period['endTime']),
                temp=period['temperature'],
                windspeed=period['windSpeed'],
            )
            for period in periods
        ]

        session.add_all(points)
        session.commit()

    def is_new(self):
        """Decide whether upload should run.

        Always returns True, so the upload simply happens on schedule.
        """
        return True

Collect Energy Data into a CSV every minute

class Energy(Collector):
    """Example collector that saves the ERCOT load table to a dated CSV
    whenever the page reports a newer "last changed" timestamp."""

    start_time = datetime.now()#to start immediately

    scheduler = Scheduler(minutes=1, #every minute
                        count=2, #try 2 times
                        separator=2, #two seconds between tries
                        start_time = start_time)
    first_run = True    # True until the first successful is_new check
    last_update = None  # datetime of the most recent change already seen

    def upload(self):
        '''Download the page, take its first HTML table and write it to
        a CSV file named after the last update date.

        Runs on schedule, and will only run if is_new returns true, so
        self.last_update is expected to be set by then.'''
        # get_site is a method: it must be called to obtain the response.
        response = self.get_site()
        df = pd.read_html(response.text)[0]
        title = 'ercot_dam_clearing_'+self.last_update.strftime('%m_%d_%Y')+'.csv'
        # The context manager guarantees the file is flushed and closed;
        # newline='' lets the CSV writer control line endings itself.
        with open(title, 'w', newline='') as file:
            df.to_csv(file)

    def is_new(self):
        '''Evaluates if the data should be uploaded,
        if it only returns True, then it will just upload
        on schedule.

        Returns True on the first successful check and afterwards only
        when the page's "last changed" time is later than the one
        recorded in self.last_update.'''
        # Fetch before touching any state, so a failed request leaves
        # first_run/last_update untouched and a retry starts clean.
        last_changed = self.get_last_changed()

        if self.first_run:
            #first run load whatever is there
            self.first_run = False
            self.last_update = last_changed
            return True

        #if it has changed since we last updated, download
        if self.last_update < last_changed:
            self.last_update = last_changed
            return True
        return False

    def get_site(self):
        '''Request the ERCOT page and return the requests response.'''
        return requests.get('http://www.ercot.com/content/cdr/html/actual_loads_of_forecast_zones')

    def soup(self):
        '''Return the ERCOT page parsed with BeautifulSoup.'''
        r = self.get_site()
        return BeautifulSoup(r.text,'html.parser')

    def get_last_changed(self):
        '''Parse the timestamp that follows "Time:" in the page's
        'schedTime rightAlign' div and return it as a datetime.'''
        soup = self.soup()
        last_change = soup.find('div',attrs={'class':'schedTime rightAlign'})
        last_change = last_change.text.split('Time:')[1].lstrip()
        return datetime.strptime(last_change,'%b %d, %Y %H:%M')

Collect TikTok Data into a MongoDB every day

class TikTokUser(MongoModel):
    # Document model for one snapshot of a TikTok profile. Every counter is
    # declared as a CharField; the TikTok collector fills them with the text
    # scraped from the profile page.
    username = fields.CharField()
    followers = fields.CharField()
    likes = fields.CharField()
    following = fields.CharField()

    class Meta:
        # NOTE(review): j=True presumably requests journaled write
        # acknowledgement - confirm against the WriteConcern docs.
        write_concern = WriteConcern(j=True)
        # NOTE(review): 'my-app' presumably has to match a connection alias
        # registered elsewhere - that registration is not shown here.
        connection_alias = 'my-app'

class TikTok(Collector):
    """Example collector that scrapes a TikTok profile page and saves the
    counters as a TikTokUser document."""

    start_time = datetime.now()

    scheduler = Scheduler(
        days=1,  # every day
        count=1,  # try 1 times
        separator=1,  # not applicable
        start_time=start_time,  # start now
    )

    def upload(self):
        """Scrape the profile counters and save them as a TikTokUser.

        Runs on schedule, and only when is_new returns True.
        """
        stats = self.user_stats('gordonramsayofficial')
        document = TikTokUser.from_document(stats)
        document.save()

    def is_new(self):
        """Decide whether upload should run.

        Always returns True, so the upload simply happens on schedule.
        """
        return True

    def user_stats(self,user ='gordonramsayofficial'):
        """Return a dict with the 'following', 'followers' and 'likes'
        texts found on the user's profile page, plus the 'username'."""
        response = self.user_raw(user)
        page = BeautifulSoup(response.text, 'html.parser')
        counters = page.find('h2', attrs={'class': 'count-infos'})

        def counter_text(title):
            # Text of the <strong> element carrying the given title.
            return counters.find('strong', attrs={'title': title}).text

        following = counter_text('Following')
        followers = counter_text('Followers')
        likes = counter_text('Likes')
        return {
            'following': following,
            'followers': followers,
            'likes': likes,
            'username': user,
        }

    def user_raw(self, user):
        """Request the user's profile page with browser-like headers and
        return the requests response."""
        profile_url = f'https://www.tiktok.com/@{user}?lang=en'
        browser_headers = {
            "authority": "m.tiktok.com",
            "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0",
            "method": "GET",
            "scheme": "https",
            "accept": "application/json, text/plain, */*",
            "accept-encoding": 'gzip, deflate, utf-8',
            "accept-language": "en-US,en;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "sec-gpc": "1",
        }
        return requests.get(profile_url, headers=browser_headers)

Project details

Development Status
- 3 - Alpha
Environment
- Console
License
- OSI Approved :: MIT License
Operating System
- OS Independent
Programming Language
- Python :: 3

Release history Release notifications | RSS feed

This version

0.0.25

Dec 31, 2021

0.0.24

Jul 3, 2021

0.0.23

Jun 29, 2021

0.0.22

Jun 29, 2021

0.0.21

Jun 29, 2021

0.0.3

Jun 29, 2021

0.0.2

Jun 29, 2021

0.0.1

Jun 29, 2021

Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

py_collector-0.0.25.tar.gz (5.4 kB view hashes)

Uploaded Dec 31, 2021 Source

Built Distribution

py_collector-0.0.25-py3-none-any.whl (5.5 kB view hashes)

Uploaded Dec 31, 2021 Python 3

Hashes for py_collector-0.0.25.tar.gz

Hashes for py_collector-0.0.25.tar.gz
Algorithm	Hash digest
SHA256	`6c90cad2401f36fa8dc18298d03775e06137c439871b9692c0a9bf01344abf77`
MD5	`7d81f0748be4273b7743b9778edbf08c`
BLAKE2b-256	`913fb38ff8470b8a1ea34b7fdb0e1468f8076d10f0c235f9fcc9c81bd5719c50`

Hashes for py_collector-0.0.25-py3-none-any.whl

Hashes for py_collector-0.0.25-py3-none-any.whl
Algorithm	Hash digest
SHA256	`104d0a225ae8964bd9c2f81980a90486c2db124073188e6cd4be10f1666736ba`
MD5	`8fa59d1819e196cd367ede89010c6998`
BLAKE2b-256	`4ee481e9f3d2ca055ea37ecfa991fb7b6844f4cbaa20dd3152f2890cee67f271`