Merge pull request #35 from codez0mb1e/data-crawlers

Data crawlers
This commit is contained in:
Dmitry Petukhov 2022-10-22 12:28:01 +03:00 committed by GitHub
commit 5cff2c0eda
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 339 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/data/*.csv
/data/*.zip

337
src/binance_open_data.ipynb Normal file
View File

@ -0,0 +1,337 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Binance Open Data lab"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[Binance Open Data](https://github.com/binance/binance-public-data/#klines) and analyze it.\n",
"\n",
"### Stet 1. Download data \n",
"\n",
"Downloading __1-minute candles__ for `BTC/USDT` and `BTC/UDSC` using `bash` or `powershell` scripts:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"#!/bin/sh\n",
"\n",
"# create dir for data\n",
"!mkdir ../data\n",
"\n",
"# download data using GET request\n",
"!wget -N -P ../data https://data.binance.vision/data/spot/daily/klines/BTCUSDT/1m/BTCUSDT-1m-2022-06-21.zip\n",
"!wget -N -P../data https://data.binance.vision/data/spot/daily/klines/BTCUSDC/1m/BTCUSDC-1m-2022-06-21.zip\n",
"\n",
"# unzip\n",
"!unzip -o -d ../data ../data/BTCUSDT-1m-2022-06-21.zip \n",
"!unzip -o -d ../data ../data/BTCUSDC-1m-2022-06-21.zip"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 2: Import data to Dataframe \n",
"\n",
"Import packages for data analysis:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import httpx\n",
"\n",
"from datetime import datetime"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import data from CSV file to Pandas DataFrame:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_data(pair: str) -> pd.DataFrame:\n",
" return pd.read_csv(f'../data/{pair}-1m-2022-06-21.csv', header = None)\n",
"\n",
"btcusdt_df = get_data('BTCUSDT')\n",
"btcusdt_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set names to columns:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def set_column_names(df: pd.DataFrame) -> pd.DataFrame:\n",
" column_names_mapping = {\n",
" 0: 'Open_time',\n",
" 1: 'Open',\n",
" 2: 'High',\n",
" 3: 'Low',\n",
" 4: 'Close',\n",
" 5: 'Volume',\n",
" 6: 'Close_time',\n",
" 7: 'Quote_asset_volume',\n",
" 8: 'Number_of_trades',\n",
" 9: 'Taker_buy_base_asset_volume',\n",
" 10: 'Taker_buy_quote_asset_volume',\n",
" 11: 'Ignore'\n",
" }\n",
" return df.rename(columns=column_names_mapping)\n",
"\n",
"btcusdt_df = set_column_names(btcusdt_df)\n",
"btcusdt_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Convert timestamp to human-readable date and time format:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"btcusdt_df['Open_time'] = btcusdt_df.iloc[:, 0].apply(lambda t: datetime.fromtimestamp(t/1000))\n",
"btcusdt_df['Close_time'] = btcusdt_df.iloc[:, 6].apply(lambda t: datetime.fromtimestamp(t/1000))\n",
"\n",
"btcusdt_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's take a look at _Descriptive statistics_ (min, mean, max, standard deviation):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"btcusdt_df.describe(datetime_is_numeric=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 2: Transform data\n",
"\n",
"Calculate __1-hour OHLCV__ candles:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def calculate_ohclv(df: pd.DataFrame) -> pd.DataFrame:\n",
" df['hour'] = df['Close_time'].apply(lambda t: t.hour)\n",
"\n",
" return (\n",
" df\n",
" .groupby(['hour'])\n",
" .agg(\n",
" {\n",
" 'Open': 'first',\n",
" 'High': max,\n",
" 'Low': min,\n",
" 'Close': 'last',\n",
" 'Volume': sum,\n",
" 'Close_time': max\n",
" }\n",
" )\n",
" .reset_index()\n",
" .drop(columns=['hour'])\n",
" )\n",
"\n",
"btcusdt_1h_df = calculate_ohclv(btcusdt_df)\n",
"\n",
"btcusdt_1h_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Data validation is very important. Let's write domain-driven asserts:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert(\n",
" isinstance(btcusdt_1h_df, pd.DataFrame)\n",
" and btcusdt_1h_df.shape == (24, 6)\n",
" and not btcusdt_1h_df.isnull().any().any()\n",
" and btcusdt_1h_df.iloc[:, 0:5].ge(0).all().all()\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 3: Expand the dataset with information about `BTC/USDC` \n",
"\n",
"Download `BTC/USDC` 1-minute candles and transform it to 1-hour candles:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"btcusdc_df = get_data('BTCUSDC') # download data\n",
"btcusdc_df = set_column_names(btcusdc_df) # set column names\n",
"btcusdc_df['Close_time'] = btcusdc_df.iloc[:, 6].apply(lambda t: datetime.fromtimestamp(t/1000)) # convert timestamp to date+time\n",
"\n",
"btcusdc_1h_df = calculate_ohclv(btcusdc_df) # calculate 1h OHCLV candles\n",
"btcusdc_1h_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Join altogether:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"btcusdt_1h_df['pair'] = 'BTC-USDT'\n",
"btcusdc_1h_df['pair'] = 'BTC-USDC'\n",
"\n",
"# Join datasets\n",
"candles_1h_df = pd.concat([btcusdt_1h_df, btcusdc_1h_df])\n",
"\n",
"# Validate result\n",
"assert(\n",
" isinstance(candles_1h_df, pd.DataFrame)\n",
" and candles_1h_df.shape == (48, 7)\n",
" and (candles_1h_df['pair'].unique() == ['BTC-USDT', 'BTC-USDC']).all()\n",
")\n",
"\n",
"# Sort output by Close_time\n",
"candles_1h_df.sort_values('Close_time')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (Optional) Use Binance API"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# %%\n",
"import pandas as pd \n",
"from binance.client import Client\n",
"\n",
"\n",
"# %%\n",
"api_key = \"****\"\n",
"secret_key = \"***\"\n",
"\n",
"client = Client(api_key, secret_key)\n",
"\n",
"\n",
"# %%\n",
"coins_response = client.get_all_coins_info()\n",
"coins_df = pd.DataFrame.from_dict(coins_response, orient='columns')\n",
"\n",
"\n",
"# %%\n",
"pairs_list = coins_df.coin.apply(lambda x: f\"{x}USDT\") \n",
"client.get_historical_klines(\n",
" 'BTCUSDT', \n",
" interval=Client.KLINE_INTERVAL_1HOUR,\n",
" start_str='2022-04-21', \n",
" end_str='2022-04-22'\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "6fd7ff10be7e3a66c1b3745c4cbc00041a2589eb74ab4be46a3698a7b56001aa"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}