This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # | |
| # Run like this: | |
| # beaker experiment logs <experiment id> | python logs.py | |
| # | |
| import sys | |
| import re | |
| from collections import defaultdict, Counter | |
| from typing import List, Dict |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import mmap | |
| import xxhash | |
| def _checksum_artifact(path: PathOrStr) -> str: | |
| filepath = Path(path) | |
| if not filepath.is_file(): | |
| raise FileNotFoundError(str(filepath)) | |
| h = xxhash.xxh128() | |
| with filepath.open("rb") as f: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| float32: 3.71 s ± 2.95 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) | |
| float16: 2.29 s ± 8.15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) | |
| bfloat16: 2.29 s ± 9.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) | |
| torch.backends.cuda.matmul.allow_tf32 = False | |
| float32: 24.4 s ± 41.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import dill | |
| import mmh3 | |
| import typing | |
| import io | |
def hash_object(o: typing.Any) -> bytes:
    """Return a 128-bit MurmurHash3 digest of the dill serialization of ``o``.

    Args:
        o: Any object that ``dill`` is able to serialize.

    Returns:
        The raw digest produced by ``mmh3.hash_bytes`` (a ``bytes`` object,
        not a ``str``), computed with the x64-optimized variant.
    """
    # dill.dumps serializes to an in-memory buffer and returns its contents,
    # so no explicit BytesIO is needed.
    # NOTE(review): the digest is only as stable as dill's byte output for
    # the object; confirm before persisting hashes across environments.
    return mmh3.hash_bytes(dill.dumps(o), x64arch=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| STOPWORDS = { | |
| "i", | |
| "me", | |
| "my", | |
| "myself", | |
| "we", | |
| "our", | |
| "ours", | |
| "ourselves", | |
| "you", |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import collections | |
| import typing | |
| _T = typing.TypeVar('_T') | |
| def filter_near_duplicates(items: typing.Iterable[_T], key = lambda x: x) -> typing.Generator[_T, None, None]: | |
| """Filters out items that overlap too much with items we've seen earlier in the sequence.""" | |
| trigram_to_sentence_indices = collections.defaultdict(set) | |
| for sentence_index, item in enumerate(items): | |
| sentence = key(item) | |
| trigrams = [sentence[i:i+3] for i in range(len(sentence) - 1)] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import * | |
| import time | |
| import logging | |
| def logging_tqdm( | |
| i, | |
| *, | |
| logger: Optional[logging.Logger] = None, | |
| desc: str = "Working", | |
| total: Optional[int] = None, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def distinctFromIterator[T](input: Iterator[T]): Iterator[T] = new Iterator[T] { | |
| private val seen: mutable.Set[T] = mutable.Set[T]() | |
| private def findNextItem(): Option[T] = { | |
| if(input.hasNext) { | |
| val n = input.next() | |
| val newItem = seen.add(n) | |
| if(newItem) | |
| Some(n) | |
| else | |
| findNextItem() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def linesFromFile(filename: String): Iterator[String] = new Iterator[String] { | |
| val bufferedReader = { | |
| val fileInputStream = new FileInputStream(filename) | |
| val decompressedInputStream = | |
| if(filename.endsWith(".gz")) new GZIPInputStream(fileInputStream) else fileInputStream | |
| val reader = new InputStreamReader(decompressedInputStream, "UTF-8") | |
| new BufferedReader(reader) | |
| } | |
| private var nextLine = bufferedReader.readLine() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import * | |
| import multiprocessing as mp | |
| def mp_map(fn, input_sequence: Iterable) -> Iterable: | |
| input_queue = mp.Queue() | |
| output_queue = mp.Queue() | |
| def process_items(): | |
| while True: | |
| item = input_queue.get() |
NewerOlder