Comparison with SQL¶
Since many potential pandas users have some familiarity with SQL, this page is meant to provide some examples of how various SQL operations would be performed using pandas.
If you’re new to pandas, you might want to first read through 10 Minutes to pandas to familiarize yourself with the library.
As is customary, we import pandas and NumPy as follows:
In [1]: import pandas as pd
In [2]: import numpy as np
Most of the examples will utilize the tips
dataset found within pandas tests. We’ll read
the data into a DataFrame called tips and assume we have a database table of the same name and
structure.
In [3]: url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv'
In [4]: tips = pd.read_csv(url)
---------------------------------------------------------------------------
ConnectionRefusedError Traceback (most recent call last)
/usr/lib/python3.7/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1347 h.request(req.get_method(), req.selector, req.data, headers,
-> 1348 encode_chunked=req.has_header('Transfer-encoding'))
1349 except OSError as err: # timeout error
/usr/lib/python3.7/http/client.py in request(self, method, url, body, headers, encode_chunked)
1243 """Send a complete request to the server."""
-> 1244 self._send_request(method, url, body, headers, encode_chunked)
1245
/usr/lib/python3.7/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
1289 body = _encode(body, 'body')
-> 1290 self.endheaders(body, encode_chunked=encode_chunked)
1291
/usr/lib/python3.7/http/client.py in endheaders(self, message_body, encode_chunked)
1238 raise CannotSendHeader()
-> 1239 self._send_output(message_body, encode_chunked=encode_chunked)
1240
/usr/lib/python3.7/http/client.py in _send_output(self, message_body, encode_chunked)
1025 del self._buffer[:]
-> 1026 self.send(msg)
1027
/usr/lib/python3.7/http/client.py in send(self, data)
965 if self.auto_open:
--> 966 self.connect()
967 else:
/usr/lib/python3.7/http/client.py in connect(self)
1398
-> 1399 super().connect()
1400
/usr/lib/python3.7/http/client.py in connect(self)
937 self.sock = self._create_connection(
--> 938 (self.host,self.port), self.timeout, self.source_address)
939 self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
/usr/lib/python3.7/socket.py in create_connection(address, timeout, source_address)
730 if err is not None:
--> 731 raise err
732 else:
/usr/lib/python3.7/socket.py in create_connection(address, timeout, source_address)
719 sock.bind(source_address)
--> 720 sock.connect(sa)
721 # Break explicitly a reference cycle
ConnectionRefusedError: [Errno 146] Connection refused
During handling of the above exception, another exception occurred:
URLError Traceback (most recent call last)
<ipython-input-4-8ab2297b7141> in <module>()
----> 1 tips = pd.read_csv(url)
/ws/builds/jenkins/ws/du3/components/pandas/build/pandas-0.23.3+dfsg/debian/python3-pandas/usr/lib/python3/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)
676 skip_blank_lines=skip_blank_lines)
677
--> 678 return _read(filepath_or_buffer, kwds)
679
680 parser_f.__name__ = name
/ws/builds/jenkins/ws/du3/components/pandas/build/pandas-0.23.3+dfsg/debian/python3-pandas/usr/lib/python3/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
422 compression = _infer_compression(filepath_or_buffer, compression)
423 filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
--> 424 filepath_or_buffer, encoding, compression)
425 kwds['compression'] = compression
426
/ws/builds/jenkins/ws/du3/components/pandas/build/pandas-0.23.3+dfsg/debian/python3-pandas/usr/lib/python3/dist-packages/pandas/io/common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode)
193
194 if _is_url(filepath_or_buffer):
--> 195 req = _urlopen(filepath_or_buffer)
196 content_encoding = req.headers.get('Content-Encoding', None)
197 if content_encoding == 'gzip':
/usr/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
/usr/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
523 req = meth(req)
524
--> 525 response = self._open(req, data)
526
527 # post-process response
/usr/lib/python3.7/urllib/request.py in _open(self, req, data)
541 protocol = req.type
542 result = self._call_chain(self.handle_open, protocol, protocol +
--> 543 '_open', req)
544 if result:
545 return result
/usr/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
/usr/lib/python3.7/urllib/request.py in https_open(self, req)
1389 def https_open(self, req):
1390 return self.do_open(http.client.HTTPSConnection, req,
-> 1391 context=self._context, check_hostname=self._check_hostname)
1392
1393 https_request = AbstractHTTPHandler.do_request_
/usr/lib/python3.7/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1348 encode_chunked=req.has_header('Transfer-encoding'))
1349 except OSError as err: # timeout error
-> 1350 raise URLError(err)
1351 r = h.getresponse()
1352 except:
URLError: <urlopen error [Errno 146] Connection refused>
In [5]: tips.head()