Created
July 4, 2023 09:51
-
-
Save josifoski/d2d3e787d758168a162c25957e7ba678 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Script for scraping Bible texts from bible.com
# creator: Aleksandar Josifoski for Troy Lyndon troylyndon@gmail.com property of http://rdgames.us
# 2016-03-23
# http://bible.com/versions
# For server-based scraping, recommended with heart: ♡ pythonanywhere ♡

## INPUT #####################################################################################################################
# First, look at bc_keys.py and pick the keys of the translations to scrape.
# Second, list those keys in pool as quoted numbers, e.g. pool = ["86", "110"].
# For example, ENG_KJV has key "1".
pool = [
]

# To scrape only one book, or just some of them, reduce bookslist.
bookslist = [
    "gen", "exo", "lev", "num", "deu", "jos", "jdg", "rut", "1sa", "2sa", "1ki", "2ki",
    "1ch", "2ch", "ezr", "neh", "est", "job", "psa", "pro", "ecc", "sng", "isa", "jer",
    "lam", "ezk", "dan", "hos", "jol", "amo", "oba", "jon", "mic", "nam", "hab", "zep",
    "hag", "zec", "mal", "mat", "mrk", "luk", "jhn", "act", "rom", "1co", "2co", "gal",
    "eph", "php", "col", "1th", "2th", "1ti", "2ti", "tit", "phm", "heb", "jas", "1pe",
    "2pe", "1jn", "2jn", "3jn", "jud", "rev",
]
#bookslist = [ "1pe","2pe","1jn","2jn","3jn","jud" ]

# NOTE: both directories below must end with "/". Start them with "/" for an
# absolute path; without a leading "/" the directories are created relative to
# the current working directory.
# Important!!! Check rootdir and xmlrootdir before running.
#rootdir = '/data/rdgames/testing/'  # again, the last character must be /
#xmlrootdir = '/data/rdgames/testing/xml/'
rootdir = '/home/josifoski/bibles/bc/'  # server location; comment out whichever pair you do not use
xmlrootdir = '/home/josifoski/bibles/bcxml/'  # for server scraping put the correct username here

# To rescrape a Bible, delete the generated *_statusdone file in the directory
# where that Bible is saved.
##############################################################################################################################
import codecs
import datetime
import html
import os
import random
import re
import sys
import time
import urllib.request
import zipfile

from bs4 import BeautifulSoup

from bc_keys import numtobibleabbrev
# Three-letter book codes (as used in bookslist and in the chapter URLs) mapped
# to the book names used in output file names. Insertion order is canonical
# book order.
dabbrevbook = {
    "gen": "Genesis", "exo": "Exodus", "lev": "Leviticus", "num": "Numbers",
    "deu": "Deuteronomy", "jos": "Joshua", "jdg": "Judges", "rut": "Ruth",
    "1sa": "1Samuel", "2sa": "2Samuel", "1ki": "1Kings", "2ki": "2Kings",
    "1ch": "1Chronicles", "2ch": "2Chronicles", "ezr": "Ezra", "neh": "Nehemiah",
    "est": "Esther", "job": "Job", "psa": "Psalms", "pro": "Proverbs",
    "ecc": "Ecclesiastes", "sng": "SongofSolomon", "isa": "Isaiah", "jer": "Jeremiah",
    "lam": "Lamentations", "ezk": "Ezekiel", "dan": "Daniel", "hos": "Hosea",
    "jol": "Joel", "amo": "Amos", "oba": "Obadiah", "jon": "Jonah", "mic": "Micah",
    "nam": "Nahum", "hab": "Habakkuk", "zep": "Zephaniah", "hag": "Haggai",
    "zec": "Zechariah", "mal": "Malachi", "mat": "Matthew", "mrk": "Mark",
    "luk": "Luke", "jhn": "John", "act": "Acts", "rom": "Romans",
    "1co": "1Corinthians", "2co": "2Corinthians", "gal": "Galatians",
    "eph": "Ephesians", "php": "Philippians", "col": "Colossians",
    "1th": "1Thessalonians", "2th": "2Thessalonians", "1ti": "1Timothy",
    "2ti": "2Timothy", "tit": "Titus", "phm": "Philemon", "heb": "Hebrews",
    "jas": "James", "1pe": "1Peter", "2pe": "2Peter", "1jn": "1John",
    "2jn": "2John", "3jn": "3John", "jud": "Jude", "rev": "Revelation",
}

# Book name -> two-digit prefix used at the start of output file names and in
# the groupings log.
dprefixes = {
    "Genesis": "01", "Exodus": "02", "Leviticus": "03", "Numbers": "04",
    "Deuteronomy": "05", "Joshua": "06", "Judges": "07", "Ruth": "08",
    "1Samuel": "09", "2Samuel": "10", "1Kings": "11", "2Kings": "12",
    "1Chronicles": "13", "2Chronicles": "14", "Ezra": "15", "Nehemiah": "16",
    "Esther": "17", "Job": "18", "Psalms": "19", "Proverbs": "20",
    "Ecclesiastes": "21", "SongofSolomon": "22", "Isaiah": "23", "Jeremiah": "24",
    "Lamentations": "25", "Ezekiel": "26", "Daniel": "27", "Hosea": "28",
    "Joel": "29", "Amos": "30", "Obadiah": "31", "Jonah": "32", "Micah": "33",
    "Nahum": "34", "Habakkuk": "35", "Zephaniah": "36", "Haggai": "37",
    "Zechariah": "38", "Malachi": "39", "Matthew": "40", "Mark": "41",
    "Luke": "42", "John": "43", "Acts": "44", "Romans": "45",
    "1Corinthians": "46", "2Corinthians": "47", "Galatians": "48",
    "Ephesians": "49", "Philippians": "50", "Colossians": "51",
    "1Thessalonians": "52", "2Thessalonians": "53", "1Timothy": "54",
    "2Timothy": "55", "Titus": "56", "Philemon": "57", "Hebrews": "58",
    "James": "59", "1Peter": "60", "2Peter": "61", "1John": "62",
    "2John": "63", "3John": "64", "Jude": "65", "Revelation": "66",
}

# Book name -> number of chapters; drives how many chapter pages are requested.
booksdict = {
    "Genesis": 50, "Exodus": 40, "Leviticus": 27, "Numbers": 36,
    "Deuteronomy": 34, "Joshua": 24, "Judges": 21, "Ruth": 4,
    "1Samuel": 31, "2Samuel": 24, "1Kings": 22, "2Kings": 25,
    "1Chronicles": 29, "2Chronicles": 36, "Ezra": 10, "Nehemiah": 13,
    "Esther": 10, "Job": 42, "Psalms": 150, "Proverbs": 31,
    "Ecclesiastes": 12, "SongofSolomon": 8, "Isaiah": 66, "Jeremiah": 52,
    "Lamentations": 5, "Ezekiel": 48, "Daniel": 12, "Hosea": 14, "Joel": 3,
    "Amos": 9, "Obadiah": 1, "Jonah": 4, "Micah": 7, "Nahum": 3,
    "Habakkuk": 3, "Zephaniah": 3, "Haggai": 2, "Zechariah": 14,
    "Malachi": 4, "Matthew": 28, "Mark": 16, "Luke": 24, "John": 21,
    "Acts": 28, "Romans": 16, "1Corinthians": 16, "2Corinthians": 13,
    "Galatians": 6, "Ephesians": 6, "Philippians": 4, "Colossians": 4,
    "1Thessalonians": 5, "2Thessalonians": 3, "1Timothy": 6, "2Timothy": 4,
    "Titus": 3, "Philemon": 1, "Hebrews": 13, "James": 5, "1Peter": 5,
    "2Peter": 3, "1John": 5, "2John": 1, "3John": 1, "Jude": 1,
    "Revelation": 22,
}
# Reference table used by the integrity check: for every book, one
# 'chapter:last_verse' entry per chapter, in chapter order. After a chapter is
# scraped, the number of verse lines written is compared with the last-verse
# number stored here and any mismatch is logged.
Bibliaa = {
    'Genesis': ['1:31','2:25','3:24','4:26','5:32','6:22','7:24','8:22','9:29','10:32','11:32','12:20','13:18','14:24','15:21','16:16','17:27','18:33','19:38','20:18','21:34','22:24','23:20','24:67','25:34','26:35','27:46','28:22','29:35','30:43','31:55','32:32','33:20','34:31','35:29','36:43','37:36','38:30','39:23','40:23','41:57','42:38','43:34','44:34','45:28','46:34','47:31','48:22','49:33','50:26'],
    'Exodus': ['1:22','2:25','3:22','4:31','5:23','6:30','7:25','8:32','9:35','10:29','11:10','12:51','13:22','14:31','15:27','16:36','17:16','18:27','19:25','20:26','21:36','22:31','23:33','24:18','25:40','26:37','27:21','28:43','29:46','30:38','31:18','32:35','33:23','34:35','35:35','36:38','37:29','38:31','39:43','40:38'],
    'Leviticus': ['1:17','2:16','3:17','4:35','5:19','6:30','7:38','8:36','9:24','10:20','11:47','12:8','13:59','14:57','15:33','16:34','17:16','18:30','19:37','20:27','21:24','22:33','23:44','24:23','25:55','26:46','27:34'],
    'Numbers': ['1:54','2:34','3:51','4:49','5:31','6:27','7:89','8:26','9:23','10:36','11:35','12:16','13:33','14:45','15:41','16:50','17:13','18:32','19:22','20:29','21:35','22:41','23:30','24:25','25:18','26:65','27:23','28:31','29:40','30:16','31:54','32:42','33:56','34:29','35:34','36:13'],
    'Deuteronomy': ['1:46','2:37','3:29','4:49','5:33','6:25','7:26','8:20','9:29','10:22','11:32','12:32','13:18','14:29','15:23','16:22','17:20','18:22','19:21','20:20','21:23','22:30','23:25','24:22','25:19','26:19','27:26','28:68','29:29','30:20','31:30','32:52','33:29','34:12'],
    'Joshua': ['1:18','2:24','3:17','4:24','5:15','6:27','7:26','8:35','9:27','10:43','11:23','12:24','13:33','14:15','15:63','16:10','17:18','18:28','19:51','20:9','21:45','22:34','23:16','24:33'],
    'Judges': ['1:36','2:23','3:31','4:24','5:31','6:40','7:25','8:35','9:57','10:18','11:40','12:15','13:25','14:20','15:20','16:31','17:13','18:31','19:30','20:48','21:25'],
    'Ruth': ['1:22','2:23','3:18','4:22'],
    '1Samuel': ['1:28','2:36','3:21','4:22','5:12','6:21','7:17','8:22','9:27','10:27','11:15','12:25','13:23','14:52','15:35','16:23','17:58','18:30','19:24','20:42','21:15','22:23','23:29','24:22','25:44','26:25','27:12','28:25','29:11','30:31','31:13'],
    '2Samuel': ['1:27','2:32','3:39','4:12','5:25','6:23','7:29','8:18','9:13','10:19','11:27','12:31','13:39','14:33','15:37','16:23','17:29','18:33','19:43','20:26','21:22','22:51','23:39','24:25'],
    '1Kings': ['1:53','2:46','3:28','4:34','5:18','6:38','7:51','8:66','9:28','10:29','11:43','12:33','13:34','14:31','15:34','16:34','17:24','18:46','19:21','20:43','21:29','22:53'],
    '2Kings': ['1:18','2:25','3:27','4:44','5:27','6:33','7:20','8:29','9:37','10:36','11:21','12:21','13:25','14:29','15:38','16:20','17:41','18:37','19:37','20:21','21:26','22:20','23:37','24:20','25:30'],
    '1Chronicles': ['1:54','2:55','3:24','4:43','5:26','6:81','7:40','8:40','9:44','10:14','11:47','12:40','13:14','14:17','15:29','16:43','17:27','18:17','19:19','20:8','21:30','22:19','23:32','24:31','25:31','26:32','27:34','28:21','29:30'],
    '2Chronicles': ['1:17','2:18','3:17','4:22','5:14','6:42','7:22','8:18','9:31','10:19','11:23','12:16','13:22','14:15','15:19','16:14','17:19','18:34','19:11','20:37','21:20','22:12','23:21','24:27','25:28','26:23','27:9','28:27','29:36','30:27','31:21','32:33','33:25','34:33','35:27','36:23'],
    'Ezra': ['1:11','2:70','3:13','4:24','5:17','6:22','7:28','8:36','9:15','10:44'],
    'Nehemiah': ['1:11','2:20','3:32','4:23','5:19','6:19','7:73','8:18','9:38','10:39','11:36','12:47','13:31'],
    'Esther': ['1:22','2:23','3:15','4:17','5:14','6:14','7:10','8:17','9:32','10:3'],
    'Job': ['1:22','2:13','3:26','4:21','5:27','6:30','7:21','8:22','9:35','10:22','11:20','12:25','13:28','14:22','15:35','16:22','17:16','18:21','19:29','20:29','21:34','22:30','23:17','24:25','25:6','26:14','27:23','28:28','29:25','30:31','31:40','32:22','33:33','34:37','35:16','36:33','37:24','38:41','39:30','40:24','41:34','42:17'],
    'Psalms': ['1:6','2:12','3:8','4:8','5:12','6:10','7:17','8:9','9:20','10:18','11:7','12:8','13:6','14:7','15:5','16:11','17:15','18:50','19:14','20:9','21:13','22:31','23:6','24:10','25:22','26:12','27:14','28:9','29:11','30:12','31:24','32:11','33:22','34:22','35:28','36:12','37:40','38:22','39:13','40:17','41:13','42:11','43:5','44:26','45:17','46:11','47:9','48:14','49:20','50:23','51:19','52:9','53:6','54:7','55:23','56:13','57:11','58:11','59:17','60:12','61:8','62:12','63:11','64:10','65:13','66:20','67:7','68:35','69:36','70:5','71:24','72:20','73:28','74:23','75:10','76:12','77:20','78:72','79:13','80:19','81:16','82:8','83:18','84:12','85:13','86:17','87:7','88:18','89:52','90:17','91:16','92:15','93:5','94:23','95:11','96:13','97:12','98:9','99:9','100:5','101:8','102:28','103:22','104:35','105:45','106:48','107:43','108:13','109:31','110:7','111:10','112:10','113:9','114:8','115:18','116:19','117:2','118:29','119:176','120:7','121:8','122:9','123:4','124:8','125:5','126:6','127:5','128:6','129:8','130:8','131:3','132:18','133:3','134:3','135:21','136:26','137:9','138:8','139:24','140:13','141:10','142:7','143:12','144:15','145:21','146:10','147:20','148:14','149:9','150:6'],
    'Proverbs': ['1:33','2:22','3:35','4:27','5:23','6:35','7:27','8:36','9:18','10:32','11:31','12:28','13:25','14:35','15:33','16:33','17:28','18:24','19:29','20:30','21:31','22:29','23:35','24:34','25:28','26:28','27:27','28:28','29:27','30:33','31:31'],
    'Ecclesiastes': ['1:18','2:26','3:22','4:16','5:20','6:12','7:29','8:17','9:18','10:20','11:10','12:14'],
    'SongofSolomon': ['1:17','2:17','3:11','4:16','5:16','6:13','7:13','8:14'],
    'Isaiah': ['1:31','2:22','3:26','4:6','5:30','6:13','7:25','8:22','9:21','10:34','11:16','12:6','13:22','14:32','15:9','16:14','17:14','18:7','19:25','20:6','21:17','22:25','23:18','24:23','25:12','26:21','27:13','28:29','29:24','30:33','31:9','32:20','33:24','34:17','35:10','36:22','37:38','38:22','39:8','40:31','41:29','42:25','43:28','44:28','45:25','46:13','47:15','48:22','49:26','50:11','51:23','52:15','53:12','54:17','55:13','56:12','57:21','58:14','59:21','60:22','61:11','62:12','63:19','64:12','65:25','66:24'],
    'Jeremiah': ['1:19','2:37','3:25','4:31','5:31','6:30','7:34','8:22','9:26','10:25','11:23','12:17','13:27','14:22','15:21','16:21','17:27','18:23','19:15','20:18','21:14','22:30','23:40','24:10','25:38','26:24','27:22','28:17','29:32','30:24','31:40','32:44','33:26','34:22','35:19','36:32','37:21','38:28','39:18','40:16','41:18','42:22','43:13','44:30','45:5','46:28','47:7','48:47','49:39','50:46','51:64','52:34'],
    'Lamentations': ['1:22','2:22','3:66','4:22','5:22'],
    'Ezekiel': ['1:28','2:10','3:27','4:17','5:17','6:14','7:27','8:18','9:11','10:22','11:25','12:28','13:23','14:23','15:8','16:63','17:24','18:32','19:14','20:49','21:32','22:31','23:49','24:27','25:17','26:21','27:36','28:26','29:21','30:26','31:18','32:32','33:33','34:31','35:15','36:38','37:28','38:23','39:29','40:49','41:26','42:20','43:27','44:31','45:25','46:24','47:23','48:35'],
    'Daniel': ['1:21','2:49','3:30','4:37','5:31','6:28','7:28','8:27','9:27','10:21','11:45','12:13'],
    'Hosea': ['1:11','2:23','3:5','4:19','5:15','6:11','7:16','8:14','9:17','10:15','11:12','12:14','13:16','14:9'],
    'Joel': ['1:20','2:32','3:21'],
    'Amos': ['1:15','2:16','3:15','4:13','5:27','6:14','7:17','8:14','9:15'],
    'Obadiah': ['1:21'],
    'Jonah': ['1:17','2:10','3:10','4:11'],
    'Micah': ['1:16','2:13','3:12','4:13','5:15','6:16','7:20'],
    'Nahum': ['1:15','2:13','3:19'],
    'Habakkuk': ['1:17','2:20','3:19'],
    'Zephaniah': ['1:18','2:15','3:20'],
    'Haggai': ['1:15','2:23'],
    'Zechariah': ['1:21','2:13','3:10','4:14','5:11','6:15','7:14','8:23','9:17','10:12','11:17','12:14','13:9','14:21'],
    'Malachi': ['1:14','2:17','3:18','4:6'],
    'Matthew': ['1:25','2:23','3:17','4:25','5:48','6:34','7:29','8:34','9:38','10:42','11:30','12:50','13:58','14:36','15:39','16:28','17:27','18:35','19:30','20:34','21:46','22:46','23:39','24:51','25:46','26:75','27:66','28:20'],
    'Mark': ['1:45','2:28','3:35','4:41','5:43','6:56','7:37','8:38','9:50','10:52','11:33','12:44','13:37','14:72','15:47','16:20'],
    'Luke': ['1:80','2:52','3:38','4:44','5:39','6:49','7:50','8:56','9:62','10:42','11:54','12:59','13:35','14:35','15:32','16:31','17:37','18:43','19:48','20:47','21:38','22:71','23:56','24:53'],
    'John': ['1:51','2:25','3:36','4:54','5:47','6:71','7:53','8:59','9:41','10:42','11:57','12:50','13:38','14:31','15:27','16:33','17:26','18:40','19:42','20:31','21:25'],
    'Acts': ['1:26','2:47','3:26','4:37','5:42','6:15','7:60','8:40','9:43','10:48','11:30','12:25','13:52','14:28','15:41','16:40','17:34','18:28','19:41','20:38','21:40','22:30','23:35','24:27','25:27','26:32','27:44','28:31'],
    'Romans': ['1:32','2:29','3:31','4:25','5:21','6:23','7:25','8:39','9:33','10:21','11:36','12:21','13:14','14:23','15:33','16:27'],
    '1Corinthians': ['1:31','2:16','3:23','4:21','5:13','6:20','7:40','8:13','9:27','10:33','11:34','12:31','13:13','14:40','15:58','16:24'],
    '2Corinthians': ['1:24','2:17','3:18','4:18','5:21','6:18','7:16','8:24','9:15','10:18','11:33','12:21','13:14'],
    'Galatians': ['1:24','2:21','3:29','4:31','5:26','6:18'],
    'Ephesians': ['1:23','2:22','3:21','4:32','5:33','6:24'],
    'Philippians': ['1:30','2:30','3:21','4:23'],
    'Colossians': ['1:29','2:23','3:25','4:18'],
    '1Thessalonians': ['1:10','2:20','3:13','4:18','5:28'],
    '2Thessalonians': ['1:12','2:17','3:18'],
    '1Timothy': ['1:20','2:15','3:16','4:16','5:25','6:21'],
    '2Timothy': ['1:18','2:26','3:17','4:22'],
    'Titus': ['1:16','2:15','3:15'],
    'Philemon': ['1:25'],
    'Hebrews': ['1:14','2:18','3:19','4:16','5:14','6:20','7:28','8:13','9:28','10:39','11:40','12:29','13:25'],
    'James': ['1:27','2:26','3:18','4:17','5:20'],
    '1Peter': ['1:25','2:25','3:22','4:19','5:14'],
    '2Peter': ['1:21','2:22','3:18'],
    '1John': ['1:10','2:29','3:24','4:21','5:21'],
    '2John': ['1:13'],
    '3John': ['1:14'],
    'Jude': ['1:25'],
    'Revelation': ['1:20','2:29','3:22','4:11','5:14','6:17','7:17','8:13','9:21','10:11','11:19','12:17','13:18','14:20','15:8','16:21','17:18','18:24','19:21','20:15','21:27','22:21'],
}
def splitversetext(snum, vtext):
    """Split the text of a grouped verse label evenly across its verses.

    A grouped label such as "22-24" covers several verses with one run of
    text. The words of ``vtext`` are divided into equal-sized chunks, one per
    verse in the range, and the last verse receives whatever is left over.

    Args:
        snum: Verse label of the form "<first>-<last>".
        vtext: Cleaned text belonging to the whole group.

    Reads the module-level names ``fzgroupingsreff`` (open log file),
    ``dprefixes``, ``dabbrevbook``, ``c`` (current book code) and ``glava``
    (current chapter number as a string).

    Side effects:
        * appends "<book prefix>;<chapter>;<label>" to ``fzgroupingsreff``;
        * replaces the module-level ``sline`` with the generated
          "{chapter:verse} text" lines;
        * adds the number of generated lines to the module-level
          ``numoflines``.
    """
    # Only names that are rebound here need a ``global`` declaration.
    global numoflines
    global sline

    fzgroupingsreff.write(dprefixes[dabbrevbook[c]] + ';' + glava + ';' + snum + os.linesep)

    words = vtext.split()
    bounds = snum.split('-')
    try:
        first = int(bounds[0])
        last = int(bounds[1])
    except (ValueError, IndexError):
        # Unparsable label: keep the text under the sentinel verse number 200.
        first = last = 200
    if last < first:
        # A reversed range used to divide by zero ("5-4") or drop the text
        # entirely ("5-3"); treat it like any other unparsable label.
        first = last = 200

    per_verse = len(words) // (last - first + 1)
    lines = []
    start = 0
    for verse in range(first, last + 1):
        if verse != last:
            chunk = words[start:start + per_verse]
            start += per_verse
        else:
            # The final verse takes the remainder of the words.
            chunk = words[start:]
        lines.append('{' + glava + ':' + str(verse) + '} ' + ' '.join(chunk) + os.linesep)

    sline = ''.join(lines)
    numoflines += len(lines)
# ---------------------------------------------------------------------------
# Main script.
#
# For every translation key in pool and every book code in bookslist, each
# chapter page is downloaded from bible.com and
#   * the raw chapter markup is archived in a per-translation zip file,
#   * the verses are written as "{chapter:verse} text" lines to one text file
#     per book (heading lines are written as "{ichapter:verse} text"),
#   * chapters whose verse count differs from Bibliaa, or that could not be
#     fetched, are logged to <abbrev>_integrityinfos.txt,
#   * grouped verse labels are logged to <abbrev>_groupingsreff.txt.
# An empty <abbrev>_statusdone file marks a translation as finished.
#
# NOTE: the names glava, c, numoflines, sline and fzgroupingsreff must stay at
# module level because splitversetext() reads and writes them as globals.
# ---------------------------------------------------------------------------

# Removes '#' pseudo-labels so they are not mistaken for verse labels.
_HASH_LABEL = re.compile(r'<span class="label">\s*#\s*</span>')
_FOOTNOTES = re.compile(r'Footnotes for.*')

# Clean-up rules applied, in this order, to the joined text of every verse.
_VERSE_CLEANUPS = (
    (re.compile(r' +'), ' '),
    (re.compile(r' ([?!.:;,])'), r'\1'),
    (re.compile(r'([?!.:;,])(\w)'), r'\1 \2'),
    (re.compile(r'([^ ])—'), r'\1 —'),
    (re.compile(r'—([^ ])'), r'— \1'),
    (re.compile(r'\[ ?\w+ ?\]'), ''),
    (re.compile(r'( \d+,) (\d+[ ,;:.])'), r'\1\2'),
    (re.compile(r'( \d+,) (\d+)$'), r'\1\2'),
    (re.compile(r'([“ ‘])L ord'), r'\1Lord'),
)
_QUOTE_FIXES = (
    ('“ ', '“'),
    (' ”', '”'),
    ('‘ ', '‘'),
    (' ’', '’'),
    (" 's", "'s"),
)


def _clean_verse_text(text):
    """Normalise spacing around punctuation and quotes in one verse's text."""
    text = text.strip()
    for pattern, replacement in _VERSE_CLEANUPS:
        text = pattern.sub(replacement, text)
    for old, new in _QUOTE_FIXES:
        text = text.replace(old, new)
    return text


def _heading_text(soup):
    """Return the space-joined text of every heading span found in soup."""
    headings = soup.find_all("span", {"class": "heading"})
    return ' '.join(tag.text.strip() for tag in headings).strip()


def _label_bounds(label):
    """Return (first, second) part of a verse label; both equal if not a range."""
    if '-' in label:
        parts = label.split('-')
        return parts[0], parts[1]
    return label, label


def _hms(seconds):
    """Split a duration in seconds into (hours, minutes, remaining seconds)."""
    hours = int(seconds / 3600)
    minutes = int((seconds - hours * 3600) / 60)
    return hours, minutes, seconds - hours * 3600 - minutes * 60


now = datetime.datetime.now()
currentdate = str(now).split()[0].replace('-', '')
writeorappend = 'w'
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
time1 = time.time()
ibiblecount = 0

# parsing Bible(s)
for bib in pool:
    ibiblecount += 1
    bib = str(bib).strip()
    abbrev = numtobibleabbrev[bib]
    # Output is grouped by the part of the abbreviation before the first '_'.
    group = abbrev.split('_')[0]

    directoryf = rootdir + group + '/' + abbrev + '/'
    os.makedirs(directoryf, exist_ok=True)
    statusfile = directoryf + abbrev + '_statusdone'
    if os.path.exists(statusfile):
        print(directoryf + abbrev + ' already previously scrapped')
        continue

    directoryfxmls = xmlrootdir + group + '/'
    os.makedirs(directoryfxmls, exist_ok=True)
    filenamexmlzip = directoryfxmls + abbrev + '_xml_files_' + currentdate + '.zip'
    sfileintegrityname = directoryf + abbrev + '_integrityinfos.txt'
    szgroupingsreff = directoryf + abbrev + '_groupingsreff.txt'

    time2 = time.time()
    errors = 0
    groupings = 0

    # fintegrity collects chapters that failed the integrity check;
    # fzgroupingsreff collects references of grouped (split) verses.
    with zipfile.ZipFile(filenamexmlzip, "w") as zxml, \
            open(sfileintegrityname, writeorappend) as fintegrity, \
            open(szgroupingsreff, writeorappend) as fzgroupingsreff:
        fintegrity.write('source: bible.com/versions' + os.linesep)

        for c in bookslist:
            time3 = time.time()
            c = c.strip()
            book = dabbrevbook[c]
            sfilename = directoryf + dprefixes[book] + '-' + book + '.' + abbrev + '.txt'
            shortname = sfilename.split('/')[-1]
            print('#' + str(ibiblecount) + '/' + str(len(pool)) + ' ' + bib + ' ' + shortname)

            with codecs.open(sfilename, writeorappend, 'utf-8') as g:
                # now we are going parsing chapters
                for i in range(booksdict[book]):
                    sys.stdout.flush()
                    glava = str(i + 1)
                    url = "http://bible.com/bible/" + bib + '/' + c + '.' + glava
                    missing = ('! ' + shortname + ' chapter: ' + glava
                               + ', do not exists on site' + os.linesep)

                    # Throttle every request, including ones that end up failing.
                    time.sleep(0.2 + random.uniform(0.01, 0.15))

                    try:
                        req = urllib.request.Request(url, headers=headers)
                        with urllib.request.urlopen(req) as resp:
                            so = str(resp.read().decode('utf-8'))
                        soup = BeautifulSoup(so, 'html.parser')
                    except Exception:
                        # Best effort: a chapter that cannot be fetched or
                        # parsed is logged and skipped so the run continues.
                        # KeyboardInterrupt/SystemExit are no longer swallowed.
                        fintegrity.write(missing)
                        errors += 1
                        continue

                    # narrow down to the part of the page holding the Bible text
                    mydivs = soup.find_all("div", {"class": "chapter"})
                    if not mydivs:
                        fintegrity.write(missing)
                        errors += 1
                        continue
                    so = html.unescape(str(mydivs[0]))

                    # Keep the original chapter markup for future reference.
                    # NOTE(review): the markup is unescaped a second time
                    # before being archived, as in the original script.
                    sfilenamexml = (directoryfxmls + dprefixes[book] + '-' + '%03d' % (i + 1)
                                    + '-' + book + '.' + abbrev + '.xml')
                    with codecs.open(sfilenamexml, 'w', 'utf-8') as fxml:
                        fxml.write(html.unescape(so))
                    zxml.write(sfilenamexml, arcname=sfilenamexml.split('/')[-1])
                    os.remove(sfilenamexml)

                    # Cut the chapter into pieces, each starting at a verse label.
                    so = _HASH_LABEL.sub('', so)
                    so = so.replace('<span class="label">', 'SplittingForGod<span class="label">')
                    lso = so.split('SplittingForGod')

                    # in case there is a heading before verse 1
                    if '<span class="heading">' in lso[0]:
                        siv1 = _heading_text(BeautifulSoup(lso[0], 'html.parser'))
                        g.write('{i' + glava + ':1} ' + siv1 + os.linesep)
                    lso = lso[1:]
                    numoflines = 0

                    # Glue any piece without a label span onto the piece before it.
                    ind = len(lso) - 1
                    while ind > 0:
                        isoup = BeautifulSoup(lso[ind], 'html.parser')
                        if isoup.find("span", {"class": "label"}) is None:
                            lso[ind - 1] = lso[ind - 1] + lso[ind]
                            del lso[ind]
                        ind -= 1

                    # Merge overlapping groups, e.g. 22-23 and 23-24 in
                    # Genesis 27 EN_MSG become 22-24.
                    lvn = [
                        BeautifulSoup(item, 'html.parser').find("span", {"class": "label"}).text.strip()
                        for item in lso
                    ]
                    ind = 0
                    while ind < len(lso) - 1:
                        gnleft, gn1 = _label_bounds(lvn[ind])
                        gn2, gnright = _label_bounds(lvn[ind + 1])
                        if gn1 == gn2:
                            merged_label = gnleft + '-' + gnright
                            lso[ind + 1] = lso[ind] + ' ' + lso[ind + 1]
                            del lso[ind]
                            lvn[ind + 1] = merged_label
                            del lvn[ind]
                            # Replace the two old labels with the merged one.
                            isoup = BeautifulSoup(lso[ind], 'html.parser')
                            isoup.find("span", {"class": "label"}).extract()
                            isoup.find("span", {"class": "label"}).extract()
                            lso[ind] = '<span class="label">' + merged_label + '</span>' + str(isoup)
                            # Stay on this index: the merged piece may overlap
                            # with the next one as well.
                        else:
                            ind += 1

                    # main: turn every piece into one or more output lines
                    for ind, piece in enumerate(lso):
                        isoup = BeautifulSoup(piece, 'html.parser')
                        label = isoup.find("span", {"class": "label"}).text.strip(' \n[]-')
                        verse_num = ''.join(ch for ch in label if ch.isdigit() or ch == '-')

                        content_tags = isoup.find_all("span", {"class": "content"})
                        verse_text = _clean_verse_text(' '.join(tag.text for tag in content_tags))
                        if 'Footnotes for' in verse_text and ind == len(lso) - 1:
                            verse_text = _FOOTNOTES.sub('', verse_text)

                        if '-' in verse_num:
                            groupings += 1
                            # Sets the module-level sline and updates numoflines.
                            splitversetext(verse_num, verse_text)
                        else:
                            sline = '{' + glava + ':' + verse_num + '} ' + verse_text + os.linesep
                            numoflines += 1

                        # A heading inside this piece introduces the next verse.
                        if isoup.find("span", {"class": "heading"}) is not None:
                            intro = _heading_text(isoup)
                            try:
                                next_verse = int(verse_num) + 1
                            except ValueError:
                                try:
                                    next_verse = int(verse_num.split('-')[1]) + 1
                                except (ValueError, IndexError):
                                    next_verse = 200
                            sline += '{i' + glava + ':' + str(next_verse) + '} ' + intro + os.linesep

                        g.write(sline)

                    # Integrity check against the expected number of verses.
                    expected_lines = int(Bibliaa[book][i].split(':')[1])
                    if numoflines != expected_lines:
                        fintegrity.write('! ' + shortname + ' ' + glava + ' ' + str(numoflines)
                                         + ' ' + str(expected_lines) + os.linesep)
                        errors += 1

            # Progress: time for this book / this Bible / the whole run.
            time4 = time.time()
            h1hours, h1min, h1sec = _hms(time4 - time3)
            h2hours, h2min, h2sec = _hms(time4 - time2)
            h3hours, h3min, h3sec = _hms(time4 - time1)
            print("#%s/%s (%dh:%dm:%ds / %dh:%dm:%ds / #%s %dh:%dm:%ds)" % (str(ibiblecount), str(len(pool)), h1hours, h1min, h1sec, h2hours, h2min, h2sec, str(ibiblecount), h3hours, h3min, h3sec))

        fintegrity.write('Total number of errors per chapter for ' + directoryf + abbrev + ' : ' + str(errors) + os.linesep)

    # Recomputed here so the summary also works when bookslist is empty.
    time4 = time.time()
    h2hours, h2min, h2sec = _hms(time4 - time2)
    h3hours, h3min, h3sec = _hms(time4 - time1)
    print("#%s/%s Done %s %s %dh:%dm:%ds / %dh:%dm:%ds)" % (str(ibiblecount), str(len(pool)), bib, abbrev, h2hours, h2min, h2sec, h3hours, h3min, h3sec))
    print('********')

    # Marker file: this translation is skipped on the next run.
    with open(statusfile, 'w'):
        pass

print('Done!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment